{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 任务目标：利用异烟酸生产过程中的各参数，预测最终异烟酸的收率\n",
    "<ul>\n",
    "    <li>数据集包括工程中10各步骤的参数，样本id、A1-A28、B1-B14包括原料、辅料、时间、温度、压强以及收率。\n",
    "    <li>冠军ATCG解决方案\n",
    "</ul>\n",
    "\n",
    "**预测具体的值：回归任务**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"assets/20201130145942327.png\" width=\"100%\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 数据处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import warnings\n",
    "import xgboost as xgb\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.metrics import mean_squared_error as mse\n",
    "\n",
    "warnings.simplefilter('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"导入数据集\"\"\"\n",
    "df_trn = pd.read_csv('jinnan_round1_train_20181227.csv', encoding='GB2312')   # encoding进行编码\n",
    "df_tst_a = pd.read_csv('jinnan_round1_testA_20181227.csv', encoding='GB2312')\n",
    "df_tst_b = pd.read_csv('jinnan_round1_testB_20190121.csv', encoding='GB2312') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "      <th>A1</th>\n",
       "      <th>A2</th>\n",
       "      <th>A3</th>\n",
       "      <th>A4</th>\n",
       "      <th>A5</th>\n",
       "      <th>A6</th>\n",
       "      <th>A7</th>\n",
       "      <th>A8</th>\n",
       "      <th>A9</th>\n",
       "      <th>...</th>\n",
       "      <th>B6</th>\n",
       "      <th>B7</th>\n",
       "      <th>B8</th>\n",
       "      <th>B9</th>\n",
       "      <th>B10</th>\n",
       "      <th>B11</th>\n",
       "      <th>B12</th>\n",
       "      <th>B13</th>\n",
       "      <th>B14</th>\n",
       "      <th>收率</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1528</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>13:30:00</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15:30:00</td>\n",
       "      <td>...</td>\n",
       "      <td>65</td>\n",
       "      <td>11:30:00</td>\n",
       "      <td>45.0</td>\n",
       "      <td>11:30-13:00</td>\n",
       "      <td>14:00-15:30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>800.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "      <td>0.879</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1698</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>14:00:00</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>80</td>\n",
       "      <td>6:00:00</td>\n",
       "      <td>45.0</td>\n",
       "      <td>6:00-7:30</td>\n",
       "      <td>7:30-9:00</td>\n",
       "      <td>9:00-10:00</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "      <td>0.902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_639</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>14:00:00</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>80</td>\n",
       "      <td>1:00:00</td>\n",
       "      <td>45.0</td>\n",
       "      <td>1:00-2:30</td>\n",
       "      <td>2:30-4:00</td>\n",
       "      <td>4:00-5:00</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "      <td>0.936</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_483</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>1:30:00</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>65</td>\n",
       "      <td>18:00:00</td>\n",
       "      <td>45.0</td>\n",
       "      <td>19:00-20:30</td>\n",
       "      <td>21:30-23:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>800.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "      <td>0.902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_617</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>22:00:00</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>80</td>\n",
       "      <td>9:00:00</td>\n",
       "      <td>45.0</td>\n",
       "      <td>9:00-10:30</td>\n",
       "      <td>10:30-12:00</td>\n",
       "      <td>12:00-13:00</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>420</td>\n",
       "      <td>0.983</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 44 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id   A1  A2     A3   A4        A5    A6   A7  A8        A9  ...  \\\n",
       "0  sample_1528  300 NaN  405.0  700  13:30:00  38.0  NaN NaN  15:30:00  ...   \n",
       "1  sample_1698  300 NaN  405.0  700  14:00:00  29.0  NaN NaN  16:00:00  ...   \n",
       "2   sample_639  300 NaN  405.0  700  14:00:00  29.0  NaN NaN  16:00:00  ...   \n",
       "3   sample_483  300 NaN  405.0  700   1:30:00  38.0  NaN NaN   3:00:00  ...   \n",
       "4   sample_617  300 NaN  405.0  700  22:00:00  29.0  NaN NaN   0:00:00  ...   \n",
       "\n",
       "   B6        B7    B8           B9          B10          B11     B12   B13  \\\n",
       "0  65  11:30:00  45.0  11:30-13:00  14:00-15:30          NaN   800.0  0.15   \n",
       "1  80   6:00:00  45.0    6:00-7:30    7:30-9:00   9:00-10:00  1200.0  0.15   \n",
       "2  80   1:00:00  45.0    1:00-2:30    2:30-4:00    4:00-5:00  1200.0  0.15   \n",
       "3  65  18:00:00  45.0  19:00-20:30  21:30-23:00          NaN   800.0  0.15   \n",
       "4  80   9:00:00  45.0   9:00-10:30  10:30-12:00  12:00-13:00  1200.0  0.15   \n",
       "\n",
       "   B14     收率  \n",
       "0  400  0.879  \n",
       "1  400  0.902  \n",
       "2  400  0.936  \n",
       "3  400  0.902  \n",
       "4  420  0.983  \n",
       "\n",
       "[5 rows x 44 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 观察数据\n",
    "df_trn.head()  # 可以发现A2、A7等有NaN缺失值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1396 entries, 0 to 1395\n",
      "Data columns (total 44 columns):\n",
      "样本id    1396 non-null object\n",
      "A1      1396 non-null int64\n",
      "A2      42 non-null float64\n",
      "A3      1354 non-null float64\n",
      "A4      1396 non-null int64\n",
      "A5      1396 non-null object\n",
      "A6      1396 non-null float64\n",
      "A7      149 non-null object\n",
      "A8      149 non-null float64\n",
      "A9      1396 non-null object\n",
      "A10     1396 non-null int64\n",
      "A11     1396 non-null object\n",
      "A12     1396 non-null int64\n",
      "A13     1396 non-null float64\n",
      "A14     1396 non-null object\n",
      "A15     1396 non-null float64\n",
      "A16     1396 non-null object\n",
      "A17     1396 non-null float64\n",
      "A18     1396 non-null float64\n",
      "A19     1396 non-null int64\n",
      "A20     1396 non-null object\n",
      "A21     1393 non-null float64\n",
      "A22     1396 non-null float64\n",
      "A23     1393 non-null float64\n",
      "A24     1395 non-null object\n",
      "A25     1396 non-null object\n",
      "A26     1394 non-null object\n",
      "A27     1396 non-null int64\n",
      "A28     1396 non-null object\n",
      "B1      1386 non-null float64\n",
      "B2      1394 non-null float64\n",
      "B3      1394 non-null float64\n",
      "B4      1396 non-null object\n",
      "B5      1395 non-null object\n",
      "B6      1396 non-null int64\n",
      "B7      1396 non-null object\n",
      "B8      1395 non-null float64\n",
      "B9      1396 non-null object\n",
      "B10     1152 non-null object\n",
      "B11     547 non-null object\n",
      "B12     1395 non-null float64\n",
      "B13     1395 non-null float64\n",
      "B14     1396 non-null int64\n",
      "收率      1396 non-null float64\n",
      "dtypes: float64(18), int64(8), object(18)\n",
      "memory usage: 480.0+ KB\n"
     ]
    }
   ],
   "source": [
    "df_trn.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 如何确定字段需要处理\n",
    "我们需要解决一些异常值，如某值相对其它值过大的离群点"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A1</th>\n",
       "      <th>A2</th>\n",
       "      <th>A3</th>\n",
       "      <th>A4</th>\n",
       "      <th>A6</th>\n",
       "      <th>A8</th>\n",
       "      <th>A10</th>\n",
       "      <th>A12</th>\n",
       "      <th>A13</th>\n",
       "      <th>A15</th>\n",
       "      <th>A17</th>\n",
       "      <th>A18</th>\n",
       "      <th>A19</th>\n",
       "      <th>A21</th>\n",
       "      <th>A22</th>\n",
       "      <th>A23</th>\n",
       "      <th>A27</th>\n",
       "      <th>B1</th>\n",
       "      <th>B2</th>\n",
       "      <th>B3</th>\n",
       "      <th>B6</th>\n",
       "      <th>B8</th>\n",
       "      <th>B12</th>\n",
       "      <th>B13</th>\n",
       "      <th>B14</th>\n",
       "      <th>收率</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1396.000000</td>\n",
       "      <td>42.0</td>\n",
       "      <td>1354.000000</td>\n",
       "      <td>1396.000000</td>\n",
       "      <td>1396.000000</td>\n",
       "      <td>149.000000</td>\n",
       "      <td>1396.000000</td>\n",
       "      <td>1396.000000</td>\n",
       "      <td>1396.000000</td>\n",
       "      <td>1396.000000</td>\n",
       "      <td>1396.000000</td>\n",
       "      <td>1396.000000</td>\n",
       "      <td>1396.000000</td>\n",
       "      <td>1393.000000</td>\n",
       "      <td>1396.000000</td>\n",
       "      <td>1393.000000</td>\n",
       "      <td>1396.000000</td>\n",
       "      <td>1386.000000</td>\n",
       "      <td>1394.000000</td>\n",
       "      <td>1394.000000</td>\n",
       "      <td>1396.000000</td>\n",
       "      <td>1395.000000</td>\n",
       "      <td>1395.000000</td>\n",
       "      <td>1395.000000</td>\n",
       "      <td>1396.000000</td>\n",
       "      <td>1396.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>298.853868</td>\n",
       "      <td>125.0</td>\n",
       "      <td>403.515510</td>\n",
       "      <td>705.974212</td>\n",
       "      <td>28.287751</td>\n",
       "      <td>78.818792</td>\n",
       "      <td>100.861032</td>\n",
       "      <td>102.641834</td>\n",
       "      <td>0.199907</td>\n",
       "      <td>103.829370</td>\n",
       "      <td>104.766905</td>\n",
       "      <td>0.199928</td>\n",
       "      <td>231.067335</td>\n",
       "      <td>48.707825</td>\n",
       "      <td>9.117120</td>\n",
       "      <td>5.002872</td>\n",
       "      <td>74.396848</td>\n",
       "      <td>334.452742</td>\n",
       "      <td>3.454412</td>\n",
       "      <td>3.500072</td>\n",
       "      <td>72.065186</td>\n",
       "      <td>43.709677</td>\n",
       "      <td>1020.215054</td>\n",
       "      <td>0.149419</td>\n",
       "      <td>410.403295</td>\n",
       "      <td>0.923244</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>10.130552</td>\n",
       "      <td>0.0</td>\n",
       "      <td>13.348093</td>\n",
       "      <td>53.214754</td>\n",
       "      <td>6.742765</td>\n",
       "      <td>2.683920</td>\n",
       "      <td>0.905198</td>\n",
       "      <td>0.915387</td>\n",
       "      <td>0.002524</td>\n",
       "      <td>0.963639</td>\n",
       "      <td>1.401446</td>\n",
       "      <td>0.002676</td>\n",
       "      <td>50.478071</td>\n",
       "      <td>4.976531</td>\n",
       "      <td>0.369152</td>\n",
       "      <td>0.136638</td>\n",
       "      <td>3.044490</td>\n",
       "      <td>105.120753</td>\n",
       "      <td>0.388585</td>\n",
       "      <td>0.002678</td>\n",
       "      <td>9.161986</td>\n",
       "      <td>4.338396</td>\n",
       "      <td>205.920155</td>\n",
       "      <td>0.008213</td>\n",
       "      <td>26.018410</td>\n",
       "      <td>0.030880</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>200.000000</td>\n",
       "      <td>125.0</td>\n",
       "      <td>270.000000</td>\n",
       "      <td>470.000000</td>\n",
       "      <td>17.000000</td>\n",
       "      <td>70.000000</td>\n",
       "      <td>100.000000</td>\n",
       "      <td>98.000000</td>\n",
       "      <td>0.120000</td>\n",
       "      <td>100.000000</td>\n",
       "      <td>89.000000</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>100.000000</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>3.500000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>45.000000</td>\n",
       "      <td>3.500000</td>\n",
       "      <td>0.150000</td>\n",
       "      <td>3.500000</td>\n",
       "      <td>40.000000</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>400.000000</td>\n",
       "      <td>0.030000</td>\n",
       "      <td>40.000000</td>\n",
       "      <td>0.624000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>300.000000</td>\n",
       "      <td>125.0</td>\n",
       "      <td>405.000000</td>\n",
       "      <td>700.000000</td>\n",
       "      <td>24.000000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>100.000000</td>\n",
       "      <td>102.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>103.000000</td>\n",
       "      <td>104.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>200.000000</td>\n",
       "      <td>50.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>73.000000</td>\n",
       "      <td>320.000000</td>\n",
       "      <td>3.500000</td>\n",
       "      <td>3.500000</td>\n",
       "      <td>65.000000</td>\n",
       "      <td>45.000000</td>\n",
       "      <td>800.000000</td>\n",
       "      <td>0.150000</td>\n",
       "      <td>400.000000</td>\n",
       "      <td>0.902000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>300.000000</td>\n",
       "      <td>125.0</td>\n",
       "      <td>405.000000</td>\n",
       "      <td>700.000000</td>\n",
       "      <td>29.000000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>101.000000</td>\n",
       "      <td>103.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>104.000000</td>\n",
       "      <td>105.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>200.000000</td>\n",
       "      <td>50.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>73.000000</td>\n",
       "      <td>320.000000</td>\n",
       "      <td>3.500000</td>\n",
       "      <td>3.500000</td>\n",
       "      <td>78.000000</td>\n",
       "      <td>45.000000</td>\n",
       "      <td>1200.000000</td>\n",
       "      <td>0.150000</td>\n",
       "      <td>400.000000</td>\n",
       "      <td>0.925000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>300.000000</td>\n",
       "      <td>125.0</td>\n",
       "      <td>405.000000</td>\n",
       "      <td>700.000000</td>\n",
       "      <td>30.000000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>102.000000</td>\n",
       "      <td>103.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>104.000000</td>\n",
       "      <td>105.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>300.000000</td>\n",
       "      <td>50.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>77.000000</td>\n",
       "      <td>330.000000</td>\n",
       "      <td>3.500000</td>\n",
       "      <td>3.500000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>45.000000</td>\n",
       "      <td>1200.000000</td>\n",
       "      <td>0.150000</td>\n",
       "      <td>420.000000</td>\n",
       "      <td>0.943000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>300.000000</td>\n",
       "      <td>125.0</td>\n",
       "      <td>405.000000</td>\n",
       "      <td>980.000000</td>\n",
       "      <td>97.000000</td>\n",
       "      <td>82.000000</td>\n",
       "      <td>103.000000</td>\n",
       "      <td>107.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>109.000000</td>\n",
       "      <td>108.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>350.000000</td>\n",
       "      <td>90.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>1200.000000</td>\n",
       "      <td>3.600000</td>\n",
       "      <td>3.600000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>73.000000</td>\n",
       "      <td>1200.000000</td>\n",
       "      <td>0.150000</td>\n",
       "      <td>460.000000</td>\n",
       "      <td>1.000800</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                A1     A2           A3           A4           A6          A8  \\\n",
       "count  1396.000000   42.0  1354.000000  1396.000000  1396.000000  149.000000   \n",
       "mean    298.853868  125.0   403.515510   705.974212    28.287751   78.818792   \n",
       "std      10.130552    0.0    13.348093    53.214754     6.742765    2.683920   \n",
       "min     200.000000  125.0   270.000000   470.000000    17.000000   70.000000   \n",
       "25%     300.000000  125.0   405.000000   700.000000    24.000000   80.000000   \n",
       "50%     300.000000  125.0   405.000000   700.000000    29.000000   80.000000   \n",
       "75%     300.000000  125.0   405.000000   700.000000    30.000000   80.000000   \n",
       "max     300.000000  125.0   405.000000   980.000000    97.000000   82.000000   \n",
       "\n",
       "               A10          A12          A13          A15          A17  \\\n",
       "count  1396.000000  1396.000000  1396.000000  1396.000000  1396.000000   \n",
       "mean    100.861032   102.641834     0.199907   103.829370   104.766905   \n",
       "std       0.905198     0.915387     0.002524     0.963639     1.401446   \n",
       "min     100.000000    98.000000     0.120000   100.000000    89.000000   \n",
       "25%     100.000000   102.000000     0.200000   103.000000   104.000000   \n",
       "50%     101.000000   103.000000     0.200000   104.000000   105.000000   \n",
       "75%     102.000000   103.000000     0.200000   104.000000   105.000000   \n",
       "max     103.000000   107.000000     0.200000   109.000000   108.000000   \n",
       "\n",
       "               A18          A19          A21          A22          A23  \\\n",
       "count  1396.000000  1396.000000  1393.000000  1396.000000  1393.000000   \n",
       "mean      0.199928   231.067335    48.707825     9.117120     5.002872   \n",
       "std       0.002676    50.478071     4.976531     0.369152     0.136638   \n",
       "min       0.100000   100.000000    20.000000     3.500000     4.000000   \n",
       "25%       0.200000   200.000000    50.000000     9.000000     5.000000   \n",
       "50%       0.200000   200.000000    50.000000     9.000000     5.000000   \n",
       "75%       0.200000   300.000000    50.000000     9.000000     5.000000   \n",
       "max       0.200000   350.000000    90.000000    10.000000    10.000000   \n",
       "\n",
       "               A27           B1           B2           B3           B6  \\\n",
       "count  1396.000000  1386.000000  1394.000000  1394.000000  1396.000000   \n",
       "mean     74.396848   334.452742     3.454412     3.500072    72.065186   \n",
       "std       3.044490   105.120753     0.388585     0.002678     9.161986   \n",
       "min      45.000000     3.500000     0.150000     3.500000    40.000000   \n",
       "25%      73.000000   320.000000     3.500000     3.500000    65.000000   \n",
       "50%      73.000000   320.000000     3.500000     3.500000    78.000000   \n",
       "75%      77.000000   330.000000     3.500000     3.500000    80.000000   \n",
       "max      80.000000  1200.000000     3.600000     3.600000    80.000000   \n",
       "\n",
       "                B8          B12          B13          B14           收率  \n",
       "count  1395.000000  1395.000000  1395.000000  1396.000000  1396.000000  \n",
       "mean     43.709677  1020.215054     0.149419   410.403295     0.923244  \n",
       "std       4.338396   205.920155     0.008213    26.018410     0.030880  \n",
       "min      20.000000   400.000000     0.030000    40.000000     0.624000  \n",
       "25%      45.000000   800.000000     0.150000   400.000000     0.902000  \n",
       "50%      45.000000  1200.000000     0.150000   400.000000     0.925000  \n",
       "75%      45.000000  1200.000000     0.150000   420.000000     0.943000  \n",
       "max      73.000000  1200.000000     0.150000   460.000000     1.000800  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# pd.set_option('display.max_rows',100)#设置最大可见100行\n",
    "pd.set_option('display.max_columns',100) #给最大列设置为100列\n",
    "df_trn.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**观测点：**\n",
    "<ul>\n",
    "    <li>A5、A9等字段的describe没有了，而head()是有的，说明这些字段有问题\n",
    "    <li>理论上，std（方差也可以）越大表明特征间的差异越大，这样模型能学到区分性，但是过大可能是数据有离群值，B1、B12是需要关注的，再看其它值，B1里面最小值是3.5，25%/50%/75%都是320，3.5非常离群，而B12里最小值和中位数和最大值像是递进。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_abnormal_revise(data):\n",
    "    df_trn = data.copy()  # 复制一份数据，不改变原数据\n",
    "    df_trn.loc[(df_trn['A1'] == 200) & (df_trn['A3'] == 405), 'A1'] = 300\n",
    "    # A5会发现三个不合法值，比如1900/1/21 0:00可能要表达的是21:00:00，我们替换掉\n",
    "    df_trn['A5'] = df_trn['A5'].replace('1900/1/21 0:00', '21:00:00')\n",
    "    df_trn['A5'] = df_trn['A5'].replace('1900/1/29 0:00', '14:00:00')\n",
    "    df_trn['A9'] = df_trn['A9'].replace('1900/1/9 7:00', '23:00:00')\n",
    "    # A9有两个不合法值\n",
    "    df_trn['A9'] = df_trn['A9'].replace('1900/1/9 7:00', '23:00:00')\n",
    "    df_trn['A9'] = df_trn['A9'].replace('700', '7:00:00')\n",
    "    # A11有一个不合法值\n",
    "    df_trn['A11'] = df_trn['A11'].replace('1900/1/1 2:30', '2:30:00')\n",
    "    df_trn['A11'] = df_trn['A11'].replace(':30:00', '00:30:00')\n",
    "    df_trn['A16'] = df_trn['A16'].replace('1900/1/12 0:00', '12:00:00')\n",
    "    df_trn['A20'] = df_trn['A20'].replace('6:00-6:30分', '6:00-6:30')\n",
    "    df_trn['A20'] = df_trn['A20'].replace('18:30-15:00', '18:30-19:00')\n",
    "    # A22有个不合法值\n",
    "    df_trn['A22'] = df_trn['A22'].replace(3.5, np.nan)\n",
    "    df_trn['A25'] = df_trn['A25'].replace('1900/3/10 0:00', 70).astype(int)\n",
    "    df_trn['A26'] = df_trn['A26'].replace('1900/3/13 0:00', '13:00:00')\n",
    "    df_trn['B1'] = df_trn['B1'].replace(3.5, np.nan)\n",
    "    df_trn['B4'] = df_trn['B4'].replace('15:00-1600', '15:00-16:00')\n",
    "    df_trn['B4'] = df_trn['B4'].replace('18:00-17:00', '16:00-17:00')\n",
    "    df_trn['B4'] = df_trn['B4'].replace('19:-20:05', '19:05-20:05')\n",
    "    df_trn['B9'] = df_trn['B9'].replace('23:00-7:30', '23:00-00:30')\n",
    "    df_trn['B14'] = df_trn['B14'].replace(40, 400)\n",
    "    return df_trn\n",
    "\n",
    "\n",
    "def test_a_abnormal_revise(data):\n",
    "    df_tst = data.copy()\n",
    "    df_tst['A5'] = df_tst['A5'].replace('1900/1/22 0:00', '22:00:00')\n",
    "    df_tst['A7'] = df_tst['A7'].replace('0:50:00', '21:50:00')\n",
    "    df_tst['B14'] = df_tst['B14'].replace(785, 385)\n",
    "    return df_tst\n",
    "\n",
    "\n",
    "def train_abnormal_adjust(data):\n",
    "    df_trn = data.copy()\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1894', 'A5'] = '14:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1234', 'A9'] = '0:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1020', 'A9'] = '18:30:00'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1380', 'A11'] = '15:30:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_844', 'A11'] = '10:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1348', 'A11'] = '17:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_25', 'A11'] = '00:30:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1105', 'A11'] = '4:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_313', 'A11'] = '15:30:00'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_291', 'A14'] = '19:30:00'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1398', 'A16'] = '11:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1177', 'A20'] = '19:00-20:00'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_71', 'A20'] = '16:20-16:50'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_14', 'A20'] = '18:00-18:30'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_69', 'A20'] = '6:10-6:50'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1500', 'A20'] = '23:00-23:30'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1524', 'A24'] = '15:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1524', 'A26'] = '15:30:00'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1046', 'A28'] = '18:00-18:30'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1230', 'B5'] = '17:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_97', 'B7'] = '1:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_752', 'B9'] = '11:00-14:00'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_609', 'B11'] = '11:00-12:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_643', 'B11'] = '12:00-13:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1164', 'B11'] = '5:00-6:00'\n",
    "    return df_trn\n",
    "\n",
    "\n",
    "def test_a_abnormal_adjust(data):\n",
    "    df_tst = data.copy()\n",
    "    df_tst.loc[df_tst['样本id'] == 'sample_919', 'A9'] = '19:50:00'\n",
    "    return df_tst\n",
    "\n",
    "\n",
    "def test_b_abnormal_adjust(data):\n",
    "    df_tst = data.copy()\n",
    "    df_tst.loc[df_tst['样本id'] == 'sample_566', 'A5'] = '18:00:00'\n",
    "    df_tst.loc[df_tst['样本id'] == 'sample_40', 'A20'] = '5:00-5:30'\n",
    "    df_tst.loc[df_tst['样本id'] == 'sample_531', 'B5'] = '1:00'\n",
    "    return df_tst"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trn = train_abnormal_revise(df_trn).pipe(train_abnormal_adjust)\n",
    "df_tst_a = test_a_abnormal_revise(df_tst_a).pipe(test_a_abnormal_adjust)\n",
    "df_tst_b = test_b_abnormal_adjust(df_tst_b)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 标签与数据集整合"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trn, df_tst = df_trn.copy(), df_tst_a.copy()\n",
    "df_target = df_trn['收率']  # 获取数据标签\n",
    "del df_trn['收率']  # 删除掉训练集的标签，即是训练数据\n",
    "df_trn_tst = df_trn.append(df_tst, ignore_index=False).reset_index(\n",
    "    drop=True)  # 把test合并一起，同时做操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "for _df in [df_trn, df_tst, df_trn_tst]:\n",
    "    _df['A3'] = _df['A3'].fillna(405)  # A3有缺失值，用众数填充"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 时间段特征处理 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns that hold a single clock time; they are renamed with a '_t' suffix.\n",
    "_cols_point = ['A5', 'A7', 'A9', 'A11', 'A14', 'A16', 'A24', 'A26', 'B5', 'B7']\n",
    "# Columns that hold a 'start-end' interval such as 21:00-21:30.\n",
    "_cols_span = ['A20', 'A28', 'B4', 'B9', 'B10', 'B11']\n",
    "\n",
    "# Apply the same reshaping to the combined frame and to both splits.\n",
    "for _df in [df_trn_tst, df_trn, df_tst]:\n",
    "    # Mark the single-time columns.\n",
    "    _df.rename(columns={_col: _col + '_t' for _col in _cols_point},\n",
    "               inplace=True)\n",
    "    for _col in _cols_span:\n",
    "        # Position of the interval column, so the new columns land right after it.\n",
    "        _idx_col = _df.columns.tolist().index(_col)\n",
    "        # Split once and reuse the pieces for both new columns.\n",
    "        _parts = _df[_col].str.split('-')\n",
    "        # '_at' holds the start time, '_bt' the end time.\n",
    "        _df.insert(_idx_col + 1, _col + '_at', _parts.str[0])\n",
    "        _df.insert(_idx_col + 2, _col + '_bt', _parts.str[1])\n",
    "        # The raw interval text is no longer needed.\n",
    "        del _df[_col]\n",
    "\n",
    "# Names of every time column as they now exist in the frames.  Built once, after the\n",
    "# loop: the previous in-loop append ran once per frame (tripling the new names) and\n",
    "# kept the pre-rename names such as 'A5' instead of 'A5_t'.\n",
    "cols_timer = ([_col + '_t' for _col in _cols_point]\n",
    "              + [_col + _sfx for _col in _cols_span for _sfx in ('_at', '_bt')])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "      <th>A1</th>\n",
       "      <th>A2</th>\n",
       "      <th>A3</th>\n",
       "      <th>A4</th>\n",
       "      <th>A5_t</th>\n",
       "      <th>A6</th>\n",
       "      <th>A7_t</th>\n",
       "      <th>A8</th>\n",
       "      <th>A9_t</th>\n",
       "      <th>...</th>\n",
       "      <th>B8</th>\n",
       "      <th>B9_at</th>\n",
       "      <th>B9_bt</th>\n",
       "      <th>B10_at</th>\n",
       "      <th>B10_bt</th>\n",
       "      <th>B11_at</th>\n",
       "      <th>B11_bt</th>\n",
       "      <th>B12</th>\n",
       "      <th>B13</th>\n",
       "      <th>B14</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1528</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>13:30:00</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15:30:00</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>11:30</td>\n",
       "      <td>13:00</td>\n",
       "      <td>14:00</td>\n",
       "      <td>15:30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>800.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1698</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>14:00:00</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>6:00</td>\n",
       "      <td>7:30</td>\n",
       "      <td>7:30</td>\n",
       "      <td>9:00</td>\n",
       "      <td>9:00</td>\n",
       "      <td>10:00</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_639</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>14:00:00</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>1:00</td>\n",
       "      <td>2:30</td>\n",
       "      <td>2:30</td>\n",
       "      <td>4:00</td>\n",
       "      <td>4:00</td>\n",
       "      <td>5:00</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_483</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>1:30:00</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>19:00</td>\n",
       "      <td>20:30</td>\n",
       "      <td>21:30</td>\n",
       "      <td>23:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>800.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_617</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>22:00:00</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>9:00</td>\n",
       "      <td>10:30</td>\n",
       "      <td>10:30</td>\n",
       "      <td>12:00</td>\n",
       "      <td>12:00</td>\n",
       "      <td>13:00</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>420</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 49 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id   A1  A2     A3   A4      A5_t    A6 A7_t  A8      A9_t  ...  \\\n",
       "0  sample_1528  300 NaN  405.0  700  13:30:00  38.0  NaN NaN  15:30:00  ...   \n",
       "1  sample_1698  300 NaN  405.0  700  14:00:00  29.0  NaN NaN  16:00:00  ...   \n",
       "2   sample_639  300 NaN  405.0  700  14:00:00  29.0  NaN NaN  16:00:00  ...   \n",
       "3   sample_483  300 NaN  405.0  700   1:30:00  38.0  NaN NaN   3:00:00  ...   \n",
       "4   sample_617  300 NaN  405.0  700  22:00:00  29.0  NaN NaN   0:00:00  ...   \n",
       "\n",
       "     B8  B9_at  B9_bt  B10_at B10_bt  B11_at B11_bt     B12   B13  B14  \n",
       "0  45.0  11:30  13:00   14:00  15:30     NaN    NaN   800.0  0.15  400  \n",
       "1  45.0   6:00   7:30    7:30   9:00    9:00  10:00  1200.0  0.15  400  \n",
       "2  45.0   1:00   2:30    2:30   4:00    4:00   5:00  1200.0  0.15  400  \n",
       "3  45.0  19:00  20:30   21:30  23:00     NaN    NaN   800.0  0.15  400  \n",
       "4  45.0   9:00  10:30   10:30  12:00   12:00  13:00  1200.0  0.15  420  \n",
       "\n",
       "[5 rows x 49 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the combined frame after the interval columns were split into start/end.\n",
    "df_trn_tst.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-derive the time column list from the frame itself: every column whose name ends\n",
    "# in 't' (the '_t', '_at' and '_bt' suffixes), kept in column order.\n",
    "cols_timer = [_name for _name in df_trn_tst.columns if _name.endswith('t')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['A5_t',\n",
       " 'A7_t',\n",
       " 'A9_t',\n",
       " 'A11_t',\n",
       " 'A14_t',\n",
       " 'A16_t',\n",
       " 'A20_at',\n",
       " 'A20_bt',\n",
       " 'A24_t',\n",
       " 'A26_t',\n",
       " 'A28_at',\n",
       " 'A28_bt',\n",
       " 'B4_at',\n",
       " 'B4_bt',\n",
       " 'B5_t',\n",
       " 'B7_t',\n",
       " 'B9_at',\n",
       " 'B9_bt',\n",
       " 'B10_at',\n",
       " 'B10_bt',\n",
       " 'B11_at',\n",
       " 'B11_bt']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the resulting list of time columns.\n",
    "cols_timer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def time_to_min(x):\n",
    "    # 将时间全部转换成分钟形式\n",
    "    if x is np.nan:\n",
    "        return np.nan\n",
    "    else:\n",
    "        x = x.replace(';', ':').replace('；', ':')\n",
    "        x = x.replace('::', ':').replace('\"', ':')\n",
    "        h, m = x.split(':')[:2]\n",
    "        h = 0 if not h else h\n",
    "        m = 0 if not m else m\n",
    "        return int(h)*60 + int(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert every time column from clock text to minutes, in all three frames.\n",
    "# NOTE(review): not re-runnable as-is - a second pass would feed numbers to\n",
    "# time_to_min, which calls str.replace on its argument.\n",
    "for _df in [df_trn_tst, df_trn, df_tst]:\n",
    "    for _col in cols_timer:\n",
    "        _df[_col] = _df[_col].map(time_to_min)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "      <th>A1</th>\n",
       "      <th>A2</th>\n",
       "      <th>A3</th>\n",
       "      <th>A4</th>\n",
       "      <th>A5_t</th>\n",
       "      <th>A6</th>\n",
       "      <th>A7_t</th>\n",
       "      <th>A8</th>\n",
       "      <th>A9_t</th>\n",
       "      <th>...</th>\n",
       "      <th>B8</th>\n",
       "      <th>B9_at</th>\n",
       "      <th>B9_bt</th>\n",
       "      <th>B10_at</th>\n",
       "      <th>B10_bt</th>\n",
       "      <th>B11_at</th>\n",
       "      <th>B11_bt</th>\n",
       "      <th>B12</th>\n",
       "      <th>B13</th>\n",
       "      <th>B14</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1528</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>810</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>930</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>690</td>\n",
       "      <td>780</td>\n",
       "      <td>840.0</td>\n",
       "      <td>930.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>800.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1698</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>840</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>960</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>360</td>\n",
       "      <td>450</td>\n",
       "      <td>450.0</td>\n",
       "      <td>540.0</td>\n",
       "      <td>540.0</td>\n",
       "      <td>600.0</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_639</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>840</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>960</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>60</td>\n",
       "      <td>150</td>\n",
       "      <td>150.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>300.0</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_483</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>90</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>180</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>1140</td>\n",
       "      <td>1230</td>\n",
       "      <td>1290.0</td>\n",
       "      <td>1380.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>800.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_617</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>1320</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>540</td>\n",
       "      <td>630</td>\n",
       "      <td>630.0</td>\n",
       "      <td>720.0</td>\n",
       "      <td>720.0</td>\n",
       "      <td>780.0</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>420</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 49 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id   A1  A2     A3   A4  A5_t    A6  A7_t  A8  A9_t  ...    B8  \\\n",
       "0  sample_1528  300 NaN  405.0  700   810  38.0   NaN NaN   930  ...  45.0   \n",
       "1  sample_1698  300 NaN  405.0  700   840  29.0   NaN NaN   960  ...  45.0   \n",
       "2   sample_639  300 NaN  405.0  700   840  29.0   NaN NaN   960  ...  45.0   \n",
       "3   sample_483  300 NaN  405.0  700    90  38.0   NaN NaN   180  ...  45.0   \n",
       "4   sample_617  300 NaN  405.0  700  1320  29.0   NaN NaN     0  ...  45.0   \n",
       "\n",
       "   B9_at  B9_bt  B10_at  B10_bt  B11_at  B11_bt     B12   B13  B14  \n",
       "0    690    780   840.0   930.0     NaN     NaN   800.0  0.15  400  \n",
       "1    360    450   450.0   540.0   540.0   600.0  1200.0  0.15  400  \n",
       "2     60    150   150.0   240.0   240.0   300.0  1200.0  0.15  400  \n",
       "3   1140   1230  1290.0  1380.0     NaN     NaN   800.0  0.15  400  \n",
       "4    540    630   630.0   720.0   720.0   780.0  1200.0  0.15  420  \n",
       "\n",
       "[5 rows x 49 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the combined frame again: the time columns now hold minutes.\n",
    "df_trn_tst.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 创建一个df来准备添加很多特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1528</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1698</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_639</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_483</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_617</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id\n",
       "0  sample_1528\n",
       "1  sample_1698\n",
       "2   sample_639\n",
       "3   sample_483\n",
       "4   sample_617"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep a snapshot of the merged data to read source columns from.\n",
    "raw = df_trn_tst.copy()\n",
    "# Start the feature table with only the sample id column; features are added to it below.\n",
    "df = raw['样本id'].to_frame()\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 温度相关特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 加热过程\n",
    "df['P1_S1_A6_0C'] = raw['A6']  # 容器初始温度\n",
    "df['P1_S2_A8_1C'] = raw['A8']  # 首次测温温度\n",
    "df['P1_S3_A10_2C'] = raw['A10']  # 准备水解温度\n",
    "df['P1_C1_C0_D'] = raw['A8'] - raw['A6']  # 测温温差\n",
    "df['P1_C2_C0_D'] = raw['A10'] - raw['A6']  # 初次沸腾温差\n",
    "\n",
    "# 水解过程\n",
    "df['P2_S1_A12_3C'] = raw['A12']  # 水解开始温度\n",
    "df['P2_S2_A15_4C'] = raw['A15']  # 水解过程测温温度\n",
    "df['P2_S3_A17_5C'] = raw['A17']  # 水解结束温度\n",
    "df['P2_C3_C0_D'] = raw['A12'] - raw['A6']  # 水解开始与初始温度温差\n",
    "df['P2_C3_C2_D'] = raw['A12'] - raw['A10']  # 水解开始前恒温温差\n",
    "df['P2_C4_C3_D'] = raw['A15'] - raw['A12']  # 水解过程中途温差\n",
    "df['P2_C5_C4_D'] = raw['A17'] - raw['A15']  # 水解结束中途温差\n",
    "df['P2_C5_C3_KD'] = raw['A17'] - raw['A12']  # 水解起止温差\n",
    "\n",
    "# 脱色过程\n",
    "df['P3_S2_A25_7C'] = raw['A25']  # 脱色保温开始温度\n",
    "df['P3_S3_A27_8C'] = raw['A27']  # 脱色保温结束温度\n",
    "df['P3_C7_C5_D'] = raw['A25'] - raw['A17']  # 降温温差\n",
    "df['P3_C8_C7_KD'] = raw['A27'] - raw['A25']  # 保温温差\n",
    "\n",
    "# 结晶过程\n",
    "df['P4_S2_B6_11C'] = raw['B6']  # 结晶开始温度\n",
    "df['P4_S3_B8_12C'] = raw['B8']  # 结晶结束温度\n",
    "df['P4_C11_C8_D'] = raw['B6'] - raw['A27']  # 脱色结束到结晶温差\n",
    "df['P4_C12_C11_KD'] = raw['B8'] - raw['B6']  # 结晶温差"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 温度相关统计特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "_funcs = ['mean', 'std', 'sum']\n",
    "\n",
    "# The inputs do not depend on the statistic, so select them once up front.  The\n",
    "# columns added below end in the statistic name, so the 'KD'/'D' suffix filters\n",
    "# pick the same columns as they would inside the loop.\n",
    "_boil_temps = raw[['A10', 'A12', 'A15', 'A17']]  # temperatures during boiling\n",
    "_boil_diffs = df[[f'P2_C{i}_C{i-1}_D' for i in range(3, 6)]].abs()  # step differences during boiling\n",
    "_key_diffs = df[[_f for _f in df.columns if _f.endswith('KD')]].abs()  # key-stage differences\n",
    "_all_diffs = df[[_f for _f in df.columns if _f.endswith('D')]].abs()  # every difference column\n",
    "_large_diffs = df[['P2_C3_C0_D', 'P3_C7_C5_D', 'P4_C12_C11_KD']].abs()  # the large differences\n",
    "\n",
    "# One row-wise statistic per sample for each group, for every statistic in _funcs.\n",
    "for _func in _funcs:\n",
    "    df[f'P2_C2-C5_{_func}'] = _boil_temps.agg(_func, axis=1)\n",
    "    df[f'P2_D3-D5_{_func}'] = _boil_diffs.agg(_func, axis=1)\n",
    "    df[f'P2_C1-C12_KD_ABS_{_func}'] = _key_diffs.agg(_func, axis=1)\n",
    "    df[f'P2_C1-C12_D_{_func}'] = _all_diffs.agg(_func, axis=1)\n",
    "    df[f'P2_LARGE_KD_{_func}'] = _large_diffs.agg(_func, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "      <th>P1_S1_A6_0C</th>\n",
       "      <th>P1_S2_A8_1C</th>\n",
       "      <th>P1_S3_A10_2C</th>\n",
       "      <th>P1_C1_C0_D</th>\n",
       "      <th>P1_C2_C0_D</th>\n",
       "      <th>P2_S1_A12_3C</th>\n",
       "      <th>P2_S2_A15_4C</th>\n",
       "      <th>P2_S3_A17_5C</th>\n",
       "      <th>P2_C3_C0_D</th>\n",
       "      <th>...</th>\n",
       "      <th>P2_C2-C5_std</th>\n",
       "      <th>P2_D3-D5_std</th>\n",
       "      <th>P2_C1-C12_KD_ABS_std</th>\n",
       "      <th>P2_C1-C12_D_std</th>\n",
       "      <th>P2_LARGE_KD_std</th>\n",
       "      <th>P2_C2-C5_sum</th>\n",
       "      <th>P2_D3-D5_sum</th>\n",
       "      <th>P2_C1-C12_KD_ABS_sum</th>\n",
       "      <th>P2_C1-C12_D_sum</th>\n",
       "      <th>P2_LARGE_KD_sum</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1528</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62.0</td>\n",
       "      <td>102.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>9.643651</td>\n",
       "      <td>24.928565</td>\n",
       "      <td>23.245071</td>\n",
       "      <td>409.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>27.0</td>\n",
       "      <td>191.0</td>\n",
       "      <td>113.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1698</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>NaN</td>\n",
       "      <td>72.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>105.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>17.785762</td>\n",
       "      <td>28.887521</td>\n",
       "      <td>25.890796</td>\n",
       "      <td>413.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_639</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>102</td>\n",
       "      <td>NaN</td>\n",
       "      <td>73.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>105.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.290994</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>18.009257</td>\n",
       "      <td>29.231642</td>\n",
       "      <td>25.514702</td>\n",
       "      <td>414.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>43.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>135.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_483</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62.0</td>\n",
       "      <td>102.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>9.165151</td>\n",
       "      <td>24.617293</td>\n",
       "      <td>22.479620</td>\n",
       "      <td>409.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>207.0</td>\n",
       "      <td>118.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_617</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>NaN</td>\n",
       "      <td>72.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>105.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>17.785762</td>\n",
       "      <td>28.887521</td>\n",
       "      <td>25.890796</td>\n",
       "      <td>413.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 37 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id  P1_S1_A6_0C  P1_S2_A8_1C  P1_S3_A10_2C  P1_C1_C0_D  \\\n",
       "0  sample_1528         38.0          NaN           100         NaN   \n",
       "1  sample_1698         29.0          NaN           101         NaN   \n",
       "2   sample_639         29.0          NaN           102         NaN   \n",
       "3   sample_483         38.0          NaN           100         NaN   \n",
       "4   sample_617         29.0          NaN           101         NaN   \n",
       "\n",
       "   P1_C2_C0_D  P2_S1_A12_3C  P2_S2_A15_4C  P2_S3_A17_5C  P2_C3_C0_D  ...  \\\n",
       "0        62.0         102.0         103.0         104.0        64.0  ...   \n",
       "1        72.0         103.0         104.0         105.0        74.0  ...   \n",
       "2        73.0         103.0         104.0         105.0        74.0  ...   \n",
       "3        62.0         102.0         103.0         104.0        64.0  ...   \n",
       "4        72.0         103.0         104.0         105.0        74.0  ...   \n",
       "\n",
       "   P2_C2-C5_std  P2_D3-D5_std  P2_C1-C12_KD_ABS_std  P2_C1-C12_D_std  \\\n",
       "0      1.707825       0.57735              9.643651        24.928565   \n",
       "1      1.707825       0.57735             17.785762        28.887521   \n",
       "2      1.290994       0.00000             18.009257        29.231642   \n",
       "3      1.707825       0.57735              9.165151        24.617293   \n",
       "4      1.707825       0.57735             17.785762        28.887521   \n",
       "\n",
       "   P2_LARGE_KD_std  P2_C2-C5_sum  P2_D3-D5_sum  P2_C1-C12_KD_ABS_sum  \\\n",
       "0        23.245071         409.0           4.0                  27.0   \n",
       "1        25.890796         413.0           4.0                  44.0   \n",
       "2        25.514702         414.0           3.0                  43.0   \n",
       "3        22.479620         409.0           4.0                  30.0   \n",
       "4        25.890796         413.0           4.0                  44.0   \n",
       "\n",
       "   P2_C1-C12_D_sum  P2_LARGE_KD_sum  \n",
       "0            191.0            113.0  \n",
       "1            226.0            134.0  \n",
       "2            226.0            135.0  \n",
       "3            207.0            118.0  \n",
       "4            226.0            134.0  \n",
       "\n",
       "[5 rows x 37 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the feature table after the temperature statistics were added.\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Temperature feature table, indexed by sample id instead of row position.\n",
    "df_temperature = df.set_index('样本id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>P1_S1_A6_0C</th>\n",
       "      <th>P1_S2_A8_1C</th>\n",
       "      <th>P1_S3_A10_2C</th>\n",
       "      <th>P1_C1_C0_D</th>\n",
       "      <th>P1_C2_C0_D</th>\n",
       "      <th>P2_S1_A12_3C</th>\n",
       "      <th>P2_S2_A15_4C</th>\n",
       "      <th>P2_S3_A17_5C</th>\n",
       "      <th>P2_C3_C0_D</th>\n",
       "      <th>P2_C3_C2_D</th>\n",
       "      <th>...</th>\n",
       "      <th>P2_C2-C5_std</th>\n",
       "      <th>P2_D3-D5_std</th>\n",
       "      <th>P2_C1-C12_KD_ABS_std</th>\n",
       "      <th>P2_C1-C12_D_std</th>\n",
       "      <th>P2_LARGE_KD_std</th>\n",
       "      <th>P2_C2-C5_sum</th>\n",
       "      <th>P2_D3-D5_sum</th>\n",
       "      <th>P2_C1-C12_KD_ABS_sum</th>\n",
       "      <th>P2_C1-C12_D_sum</th>\n",
       "      <th>P2_LARGE_KD_sum</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>样本id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>sample_1528</th>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62.0</td>\n",
       "      <td>102.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>9.643651</td>\n",
       "      <td>24.928565</td>\n",
       "      <td>23.245071</td>\n",
       "      <td>409.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>27.0</td>\n",
       "      <td>191.0</td>\n",
       "      <td>113.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_1698</th>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>NaN</td>\n",
       "      <td>72.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>105.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>17.785762</td>\n",
       "      <td>28.887521</td>\n",
       "      <td>25.890796</td>\n",
       "      <td>413.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_639</th>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>102</td>\n",
       "      <td>NaN</td>\n",
       "      <td>73.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>105.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.290994</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>18.009257</td>\n",
       "      <td>29.231642</td>\n",
       "      <td>25.514702</td>\n",
       "      <td>414.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>43.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>135.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_483</th>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62.0</td>\n",
       "      <td>102.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>9.165151</td>\n",
       "      <td>24.617293</td>\n",
       "      <td>22.479620</td>\n",
       "      <td>409.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>207.0</td>\n",
       "      <td>118.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_617</th>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>NaN</td>\n",
       "      <td>72.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>105.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>17.785762</td>\n",
       "      <td>28.887521</td>\n",
       "      <td>25.890796</td>\n",
       "      <td>413.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 36 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             P1_S1_A6_0C  P1_S2_A8_1C  P1_S3_A10_2C  P1_C1_C0_D  P1_C2_C0_D  \\\n",
       "样本id                                                                          \n",
       "sample_1528         38.0          NaN           100         NaN        62.0   \n",
       "sample_1698         29.0          NaN           101         NaN        72.0   \n",
       "sample_639          29.0          NaN           102         NaN        73.0   \n",
       "sample_483          38.0          NaN           100         NaN        62.0   \n",
       "sample_617          29.0          NaN           101         NaN        72.0   \n",
       "\n",
       "             P2_S1_A12_3C  P2_S2_A15_4C  P2_S3_A17_5C  P2_C3_C0_D  P2_C3_C2_D  \\\n",
       "样本id                                                                            \n",
       "sample_1528         102.0         103.0         104.0        64.0         2.0   \n",
       "sample_1698         103.0         104.0         105.0        74.0         2.0   \n",
       "sample_639          103.0         104.0         105.0        74.0         1.0   \n",
       "sample_483          102.0         103.0         104.0        64.0         2.0   \n",
       "sample_617          103.0         104.0         105.0        74.0         2.0   \n",
       "\n",
       "             ...  P2_C2-C5_std  P2_D3-D5_std  P2_C1-C12_KD_ABS_std  \\\n",
       "样本id         ...                                                     \n",
       "sample_1528  ...      1.707825       0.57735              9.643651   \n",
       "sample_1698  ...      1.707825       0.57735             17.785762   \n",
       "sample_639   ...      1.290994       0.00000             18.009257   \n",
       "sample_483   ...      1.707825       0.57735              9.165151   \n",
       "sample_617   ...      1.707825       0.57735             17.785762   \n",
       "\n",
       "             P2_C1-C12_D_std  P2_LARGE_KD_std  P2_C2-C5_sum  P2_D3-D5_sum  \\\n",
       "样本id                                                                        \n",
       "sample_1528        24.928565        23.245071         409.0           4.0   \n",
       "sample_1698        28.887521        25.890796         413.0           4.0   \n",
       "sample_639         29.231642        25.514702         414.0           3.0   \n",
       "sample_483         24.617293        22.479620         409.0           4.0   \n",
       "sample_617         28.887521        25.890796         413.0           4.0   \n",
       "\n",
       "             P2_C1-C12_KD_ABS_sum  P2_C1-C12_D_sum  P2_LARGE_KD_sum  \n",
       "样本id                                                                 \n",
       "sample_1528                  27.0            191.0            113.0  \n",
       "sample_1698                  44.0            226.0            134.0  \n",
       "sample_639                   43.0            226.0            135.0  \n",
       "sample_483                   30.0            207.0            118.0  \n",
       "sample_617                   44.0            226.0            134.0  \n",
       "\n",
       "[5 rows x 36 columns]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_temperature.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 时间相关特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 时间计算方式转换\n",
    "def duration_outer(series1, series2):\n",
    "    # 处理隔了一天如21：30 - 01：30\n",
    "    duration = series1 - series2\n",
    "    duration = np.where(duration < 0, duration + 24*60, duration)\n",
    "    duration = np.where(duration > 12*60, 24*60 - duration, duration)\n",
    "    duration = np.where(duration > 6*60, 12*60 - duration, duration)\n",
    "    return duration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw = df_trn_tst.copy()\n",
    "df = pd.DataFrame(raw['样本id'])\n",
    "# Column-name suffixes matter: the aggregates at the end of this cell pick\n",
    "# their inputs with endswith('D') and endswith('_K_D'). Columns are created\n",
    "# in a fixed order so the resulting feature table keeps its layout.\n",
    "\n",
    "# ---- Heating ----\n",
    "df['P1_S1_A5_0T'] = raw['A5_t']  # start time\n",
    "df['P1_S2_A9_2T'] = raw['A9_t']  # start time\n",
    "# gap to the first temperature reading\n",
    "df['P1_T1_T0_D'] = duration_outer(raw['A7_t'], raw['A5_t'])\n",
    "# gap to the second temperature reading\n",
    "df['P1_T2_T1_D'] = duration_outer(raw['A9_t'], raw['A7_t'])\n",
    "# start of heating -> boiling\n",
    "df['P1_T2_T0_K_D'] = duration_outer(raw['A9_t'], raw['A5_t'])\n",
    "\n",
    "# ---- Hydrolysis ----\n",
    "df['P2_S1_A11_3T'] = raw['A11_t']  # hydrolysis start time\n",
    "df['P2_S1_A16_5T'] = raw['A16_t']  # hydrolysis end time\n",
    "# start of heating -> feeding\n",
    "df['P2_T3_T0_K_D'] = duration_outer(raw['A11_t'], raw['A5_t'])\n",
    "# constant temperature -> feeding\n",
    "df['P2_T3_T2_K_D'] = duration_outer(raw['A11_t'], raw['A9_t'])\n",
    "# (The intermediate hydrolysis gaps based on A14_t are intentionally not built.)\n",
    "# hydrolysis duration\n",
    "df['P2_T5_T3_K_D'] = duration_outer(raw['A16_t'], raw['A11_t'])\n",
    "\n",
    "# ---- Decolorization ----\n",
    "df['P3_S1_A20_6T'] = raw['A20_at']  # neutralization start time\n",
    "# heat-preservation time; the column is named A25 but reads A24_t\n",
    "df['P3_S2_A25_7T'] = raw['A24_t']\n",
    "# end of hydrolysis -> neutralization (idle gap)\n",
    "df['P3_T6_T5_K_D'] = duration_outer(raw['A20_at'], raw['A16_t'])\n",
    "# pH neutralization duration\n",
    "df['P3_T6_T6_K_D'] = duration_outer(raw['A20_bt'], raw['A20_at'])\n",
    "# end of neutralization -> decolorization (idle gap)\n",
    "df['P3_T7_T6_D'] = duration_outer(raw['A24_t'], raw['A20_bt'])\n",
    "# decolorization heat-preservation duration\n",
    "df['P3_T8_T7_K_D'] = duration_outer(raw['A26_t'], raw['A24_t'])\n",
    "# decolorization -> suction filtration (idle gap)\n",
    "df['P3_T9_T8_D'] = duration_outer(raw['A28_at'], raw['A26_t'])\n",
    "# suction filtration duration\n",
    "df['P3_T9_T9_K_D'] = duration_outer(raw['A28_bt'], raw['A28_at'])\n",
    "# total decolorization time, measured from two different starting points\n",
    "df['P3_T9_T5_1D'] = duration_outer(raw['A28_bt'], raw['A16_t'])\n",
    "df['P3_T9_T6_2D'] = duration_outer(raw['A28_bt'], raw['A20_at'])\n",
    "\n",
    "# ---- Crystallization ----\n",
    "df['P4_S1_B4_10T'] = raw['B4_at']  # acidification start time\n",
    "df['P4_S2_B5_11T'] = raw['B5_t']  # crystallization start time\n",
    "df['P4_S3_B7_12T'] = raw['B7_t']  # crystallization end time\n",
    "# end of suction filtration -> acidification (idle gap)\n",
    "df['P4_T10_T9_D'] = duration_outer(raw['B4_at'], raw['A28_bt'])\n",
    "# acidification duration\n",
    "df['P4_T10_T10_K_D'] = duration_outer(raw['B4_bt'], raw['B4_at'])\n",
    "# acidification -> crystallization (idle gap)\n",
    "df['P4_T11_T10_K_D'] = duration_outer(raw['B5_t'], raw['B4_bt'])\n",
    "# natural crystallization duration\n",
    "df['P4_T12_T11_K_D'] = duration_outer(raw['B7_t'], raw['B5_t'])\n",
    "# total crystallization time, measured from two different starting points\n",
    "df['P4_T12_T9_1D'] = duration_outer(raw['B7_t'], raw['A28_bt'])\n",
    "df['P4_T12_T10_2D'] = duration_outer(raw['B7_t'], raw['B4_at'])\n",
    "\n",
    "# ---- Spin filtration ----\n",
    "df['P5_S1_B9_13T'] = raw['B9_at']  # spin-filtration start time\n",
    "# Spin-filtration end time: the latest run that was recorded, falling back\n",
    "# from B11_bt to B10_bt to B9_bt when the later ones are missing.\n",
    "df['P5_S3_B12_15T'] = np.where(\n",
    "    raw['B11_bt'].isnull(),\n",
    "    np.where(raw['B10_bt'].isnull(), raw['B9_bt'], raw['B10_bt']),\n",
    "    raw['B11_bt'])\n",
    "# end of crystallization stage -> spin filtration (idle gap)\n",
    "df['P5_T13_T12_D'] = duration_outer(raw['B9_at'], raw['B7_t'])\n",
    "# basic spin-filtration duration\n",
    "df['P5_T13_T13_K_D'] = duration_outer(raw['B9_bt'], raw['B9_at'])\n",
    "# basic run -> supplementary run 1 (idle gap)\n",
    "df['P5_T14_T13_D'] = duration_outer(raw['B10_at'], raw['B9_bt'])\n",
    "# supplementary run 1 duration\n",
    "df['P5_T14_T14_K_D'] = duration_outer(raw['B10_bt'], raw['B10_at'])\n",
    "# supplementary run 1 -> supplementary run 2 (idle gap)\n",
    "df['P5_T15_T14_D'] = duration_outer(raw['B11_at'], raw['B10_bt'])\n",
    "# supplementary run 2 duration\n",
    "df['P5_T15_T13_K_D'] = duration_outer(raw['B11_bt'], raw['B11_at'])\n",
    "# Total spin-filtration time, three variants. The first adds up the three\n",
    "# run durations (basic + supplementary 1 + supplementary 2); each run must\n",
    "# appear exactly once. DataFrame.sum skips NaN, so missing runs count as 0.\n",
    "df['P5_T15_T13_1D'] = df[\n",
    "    ['P5_T13_T13_K_D', 'P5_T14_T14_K_D', 'P5_T15_T13_K_D']].sum(axis=1)\n",
    "df['P5_T15_T12_2D'] = duration_outer(\n",
    "    df['P5_S3_B12_15T'], df['P4_S3_B7_12T'])\n",
    "df['P5_T15_T12_3D'] = duration_outer(\n",
    "    df['P5_S3_B12_15T'], df['P5_S1_B9_13T'])\n",
    "\n",
    "# ---- Whole process ----\n",
    "# total process duration: sum of the per-stage totals\n",
    "df['P5_T15_T1_4D'] = df[\n",
    "    ['P5_T15_T12_2D', 'P4_T12_T9_1D', 'P3_T9_T5_1D',\n",
    "     'P2_T3_T0_K_D', 'P2_T5_T3_K_D']].sum(axis=1)\n",
    "\n",
    "# Row-wise statistics over the interval columns. The generated names end in\n",
    "# 'mean' / 'std' / 'sum', so they are never picked up by a later iteration.\n",
    "_funcs = ['mean', 'std', 'sum']\n",
    "for _func in _funcs:\n",
    "    # every column whose name ends in 'D' (plain, '_K_D' and numbered totals)\n",
    "    _d_cols = [_f for _f in df.columns if _f.endswith('D')]\n",
    "    df[f'P5__D_{_func}'] = df[_d_cols].abs().agg(_func, axis=1)\n",
    "    # only the '_K_D' columns\n",
    "    _kd_cols = [_f for _f in df.columns if _f.endswith('_K_D')]\n",
    "    df[f'P5_K_D_{_func}'] = df[_kd_cols].abs().agg(_func, axis=1)\n",
    "df_duration = df.set_index('样本id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>P1_S1_A5_0T</th>\n",
       "      <th>P1_S2_A9_2T</th>\n",
       "      <th>P1_T1_T0_D</th>\n",
       "      <th>P1_T2_T1_D</th>\n",
       "      <th>P1_T2_T0_K_D</th>\n",
       "      <th>P2_S1_A11_3T</th>\n",
       "      <th>P2_S1_A16_5T</th>\n",
       "      <th>P2_T3_T0_K_D</th>\n",
       "      <th>P2_T3_T2_K_D</th>\n",
       "      <th>P2_T5_T3_K_D</th>\n",
       "      <th>...</th>\n",
       "      <th>P5_T15_T13_1D</th>\n",
       "      <th>P5_T15_T12_2D</th>\n",
       "      <th>P5_T15_T12_3D</th>\n",
       "      <th>P5_T15_T1_4D</th>\n",
       "      <th>P5__D_mean</th>\n",
       "      <th>P5_K_D_mean</th>\n",
       "      <th>P5__D_std</th>\n",
       "      <th>P5_K_D_std</th>\n",
       "      <th>P5__D_sum</th>\n",
       "      <th>P5_K_D_sum</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>样本id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>sample_1528</th>\n",
       "      <td>810</td>\n",
       "      <td>930</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>120</td>\n",
       "      <td>990</td>\n",
       "      <td>1110</td>\n",
       "      <td>180</td>\n",
       "      <td>60</td>\n",
       "      <td>120</td>\n",
       "      <td>...</td>\n",
       "      <td>270.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>840.0</td>\n",
       "      <td>145.384615</td>\n",
       "      <td>90.000000</td>\n",
       "      <td>169.852425</td>\n",
       "      <td>63.639610</td>\n",
       "      <td>3780.0</td>\n",
       "      <td>1170.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_1698</th>\n",
       "      <td>840</td>\n",
       "      <td>960</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>120</td>\n",
       "      <td>1020</td>\n",
       "      <td>1140</td>\n",
       "      <td>180</td>\n",
       "      <td>60</td>\n",
       "      <td>120</td>\n",
       "      <td>...</td>\n",
       "      <td>270.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>960.0</td>\n",
       "      <td>136.071429</td>\n",
       "      <td>90.000000</td>\n",
       "      <td>188.588113</td>\n",
       "      <td>76.258669</td>\n",
       "      <td>3810.0</td>\n",
       "      <td>1260.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_639</th>\n",
       "      <td>840</td>\n",
       "      <td>960</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>120</td>\n",
       "      <td>1020</td>\n",
       "      <td>1140</td>\n",
       "      <td>180</td>\n",
       "      <td>60</td>\n",
       "      <td>120</td>\n",
       "      <td>...</td>\n",
       "      <td>270.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>900.0</td>\n",
       "      <td>123.214286</td>\n",
       "      <td>75.000000</td>\n",
       "      <td>173.654693</td>\n",
       "      <td>49.575118</td>\n",
       "      <td>3450.0</td>\n",
       "      <td>1050.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_483</th>\n",
       "      <td>90</td>\n",
       "      <td>180</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>90</td>\n",
       "      <td>240</td>\n",
       "      <td>360</td>\n",
       "      <td>150</td>\n",
       "      <td>60</td>\n",
       "      <td>120</td>\n",
       "      <td>...</td>\n",
       "      <td>270.0</td>\n",
       "      <td>300.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>990.0</td>\n",
       "      <td>158.076923</td>\n",
       "      <td>73.846154</td>\n",
       "      <td>195.448596</td>\n",
       "      <td>46.822086</td>\n",
       "      <td>4110.0</td>\n",
       "      <td>960.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_617</th>\n",
       "      <td>1320</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>120</td>\n",
       "      <td>60</td>\n",
       "      <td>180</td>\n",
       "      <td>180</td>\n",
       "      <td>60</td>\n",
       "      <td>120</td>\n",
       "      <td>...</td>\n",
       "      <td>270.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>900.0</td>\n",
       "      <td>123.214286</td>\n",
       "      <td>77.142857</td>\n",
       "      <td>173.846539</td>\n",
       "      <td>48.107024</td>\n",
       "      <td>3450.0</td>\n",
       "      <td>1080.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 47 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             P1_S1_A5_0T  P1_S2_A9_2T  P1_T1_T0_D  P1_T2_T1_D  P1_T2_T0_K_D  \\\n",
       "样本id                                                                          \n",
       "sample_1528          810          930         NaN         NaN           120   \n",
       "sample_1698          840          960         NaN         NaN           120   \n",
       "sample_639           840          960         NaN         NaN           120   \n",
       "sample_483            90          180         NaN         NaN            90   \n",
       "sample_617          1320            0         NaN         NaN           120   \n",
       "\n",
       "             P2_S1_A11_3T  P2_S1_A16_5T  P2_T3_T0_K_D  P2_T3_T2_K_D  \\\n",
       "样本id                                                                  \n",
       "sample_1528           990          1110           180            60   \n",
       "sample_1698          1020          1140           180            60   \n",
       "sample_639           1020          1140           180            60   \n",
       "sample_483            240           360           150            60   \n",
       "sample_617             60           180           180            60   \n",
       "\n",
       "             P2_T5_T3_K_D  ...  P5_T15_T13_1D  P5_T15_T12_2D  P5_T15_T12_3D  \\\n",
       "样本id                       ...                                                \n",
       "sample_1528           120  ...          270.0          240.0          240.0   \n",
       "sample_1698           120  ...          270.0          240.0          240.0   \n",
       "sample_639            120  ...          270.0          240.0          240.0   \n",
       "sample_483            120  ...          270.0          300.0          240.0   \n",
       "sample_617            120  ...          270.0          240.0          240.0   \n",
       "\n",
       "             P5_T15_T1_4D  P5__D_mean  P5_K_D_mean   P5__D_std  P5_K_D_std  \\\n",
       "样本id                                                                         \n",
       "sample_1528         840.0  145.384615    90.000000  169.852425   63.639610   \n",
       "sample_1698         960.0  136.071429    90.000000  188.588113   76.258669   \n",
       "sample_639          900.0  123.214286    75.000000  173.654693   49.575118   \n",
       "sample_483          990.0  158.076923    73.846154  195.448596   46.822086   \n",
       "sample_617          900.0  123.214286    77.142857  173.846539   48.107024   \n",
       "\n",
       "             P5__D_sum  P5_K_D_sum  \n",
       "样本id                                \n",
       "sample_1528     3780.0      1170.0  \n",
       "sample_1698     3810.0      1260.0  \n",
       "sample_639      3450.0      1050.0  \n",
       "sample_483      4110.0       960.0  \n",
       "sample_617      3450.0      1080.0  \n",
       "\n",
       "[5 rows x 47 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_duration.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 物料消耗相关特征（水、盐酸、氢氧化钠、4-氰基吡啶）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "na_value = 405  # fill value used for missing A3 (sodium hydroxide)\n",
    "\n",
    "# Work on a fresh copy of the combined train/test frame so this cell does\n",
    "# not rely on a `raw` variable left behind by a previously executed cell.\n",
    "raw = df_trn_tst.copy()\n",
    "df = pd.DataFrame(raw['样本id'])\n",
    "# water\n",
    "df['P2_W_1M'] = raw['A4']\n",
    "df['P2_W_2M'] = raw['A19']\n",
    "# hydrochloric acid\n",
    "df['P3_H_1M'] = raw['A21'].fillna(50)\n",
    "df['P4_H_2M'] = raw['B1'].fillna(320)\n",
    "# sodium hydroxide\n",
    "df['P2_N_1M'] = raw['A3'].fillna(na_value)\n",
    "# 4-cyanopyridine\n",
    "df['P2_C_1M'] = raw['A1']\n",
    "\n",
    "# 'P5_W_3M' first holds B12 (missing -> 1200) so that it takes its column\n",
    "# position ahead of 'P5_W_1M'; two lines below it is replaced by the running\n",
    "# total A4 + A19 + B12.\n",
    "df['P5_W_3M'] = raw['B12'].fillna(1200)\n",
    "df['P5_W_1M'] = df['P2_W_1M'] + df['P2_W_2M']\n",
    "df['P5_W_3M'] = df['P2_W_1M'] + df['P2_W_2M'] + df['P5_W_3M']\n",
    "df['P5_H_1M'] = df['P3_H_1M'] + df['P4_H_2M']\n",
    "# NOTE(review): 'P5_W_3M' already contains 'P5_W_1M', so 'P5_M_1M' counts\n",
    "# A4 + A19 twice -- kept as is, confirm whether that is intended.\n",
    "df['P5_M_0M'] = raw['A1'] + df['P2_N_1M'] + df['P5_W_1M'] + df['P4_H_2M']\n",
    "df['P5_M_1M'] = df['P5_M_0M'] + df['P5_W_3M']\n",
    "df['P5_M_2M'] = df['P5_M_1M'] + df['P3_H_1M']\n",
    "# theoretical output\n",
    "df['P5_O_1M'] = raw['B14']\n",
    "# Snap rare B14 values onto a nearby frequent level. No replacement target\n",
    "# is itself a key, so one mapping is equivalent to replacing one by one.\n",
    "_b14_levels = {\n",
    "    418: 420, 405: 400, 395: 390, 392: 390, 387: 380, 385: 380,\n",
    "    370: 360, 350: 360, 340: 360, 290: 280, 260: 280, 256: 280,\n",
    "}\n",
    "df['P5_O_5M'] = raw['B14'].replace(_b14_levels)\n",
    "\n",
    "# Ratio of both output columns to every other '...M' column; the last two\n",
    "# entries of _fs are the output columns themselves and are skipped.\n",
    "_fs = [_f for _f in df.columns if _f.endswith('M')]\n",
    "for _f in _fs[:-2]:\n",
    "    df[f'{_f}_P5_O_1M_R'] = df['P5_O_1M'] / df[_f]\n",
    "    df[f'{_f}_P5_O_5M_R'] = df['P5_O_5M'] / df[_f]\n",
    "# Pairwise ratios among the first six columns (the individual raw inputs),\n",
    "# each unordered pair once with the earlier column as numerator.\n",
    "_base_fs = _fs[:6]\n",
    "for i, _f in enumerate(_base_fs):\n",
    "    for _f_div in _base_fs[i + 1:]:\n",
    "        df[f'{_f}_{_f_div}_R'] = df[_f] / df[_f_div]\n",
    "df_materials = df.set_index('样本id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw = df_trn_tst.copy()\n",
    "df = pd.DataFrame(raw['样本id'])\n",
    "# number of non-null cells per row, ignoring the first and the last column\n",
    "df['P5_NOT_NUM_N'] = raw.iloc[:, 1:-1].notnull().sum(axis=1)\n",
    "df['P5_PH_1N'] = raw['A22']\n",
    "# NOTE(review): raw['A23'] used to be stored under 'P5_PH_2N' and was then\n",
    "# immediately overwritten by raw['B2'], so A23 never reached the feature\n",
    "# table. Only the effective assignment is kept; if A23 is meant to be a\n",
    "# feature it needs a column name of its own -- confirm with the author.\n",
    "df['P5_PH_2N'] = raw['B2']\n",
    "# 1 when the A7_t reading is missing, else 0\n",
    "df['P5_A7_1N'] = raw['A7_t'].isnull().astype(int)\n",
    "# 1 when B14 is at most 360, else 0\n",
    "df['P5_O_2M'] = (raw['B14'] <= 360).astype(int)\n",
    "df['P5_1_3M'] = raw['B13']\n",
    "df_interact = df.set_index('样本id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>P5_NOT_NUM_N</th>\n",
       "      <th>P5_PH_1N</th>\n",
       "      <th>P5_PH_2N</th>\n",
       "      <th>P5_A7_1N</th>\n",
       "      <th>P5_O_2M</th>\n",
       "      <th>P5_1_3M</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>样本id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>sample_1528</th>\n",
       "      <td>42</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_1698</th>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_639</th>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_483</th>\n",
       "      <td>42</td>\n",
       "      <td>10.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_617</th>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             P5_NOT_NUM_N  P5_PH_1N  P5_PH_2N  P5_A7_1N  P5_O_2M  P5_1_3M\n",
       "样本id                                                                     \n",
       "sample_1528            42       9.0       3.5         1        0     0.15\n",
       "sample_1698            44       9.0       3.5         1        0     0.15\n",
       "sample_639             44       9.0       3.5         1        0     0.15\n",
       "sample_483             42      10.0       3.5         1        0     0.15\n",
       "sample_617             44       9.0       3.5         1        0     0.15"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_interact.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 合并所有特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Place the four feature tables side by side (they are all indexed by the\n",
    "# sample id), then turn that index back into an ordinary column.\n",
    "_feature_blocks = [df_materials, df_duration, df_temperature, df_interact]\n",
    "df_feature = pd.concat(_feature_blocks, axis=1).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The first len(df_trn) rows of the feature table are the training rows,\n",
    "# the remainder the test rows. The count is read once, before df_trn is\n",
    "# rebound to the feature version.\n",
    "_n_trn = len(df_trn)\n",
    "df_trn = df_feature.iloc[:_n_trn].reset_index(drop=True)\n",
    "df_tst = df_feature.iloc[_n_trn:].reset_index(drop=True)\n",
    "# Attach the target; pandas aligns df_target on the fresh 0..n-1 index if it\n",
    "# is a Series -- presumably it carries that index, verify where it is built.\n",
    "df_trn['收率'] = df_target\n",
    "df_tst['收率'] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "      <th>P2_W_1M</th>\n",
       "      <th>P2_W_2M</th>\n",
       "      <th>P3_H_1M</th>\n",
       "      <th>P4_H_2M</th>\n",
       "      <th>P2_N_1M</th>\n",
       "      <th>P2_C_1M</th>\n",
       "      <th>P5_W_3M</th>\n",
       "      <th>P5_W_1M</th>\n",
       "      <th>P5_H_1M</th>\n",
       "      <th>...</th>\n",
       "      <th>P2_C1-C12_KD_ABS_sum</th>\n",
       "      <th>P2_C1-C12_D_sum</th>\n",
       "      <th>P2_LARGE_KD_sum</th>\n",
       "      <th>P5_NOT_NUM_N</th>\n",
       "      <th>P5_PH_1N</th>\n",
       "      <th>P5_PH_2N</th>\n",
       "      <th>P5_A7_1N</th>\n",
       "      <th>P5_O_2M</th>\n",
       "      <th>P5_1_3M</th>\n",
       "      <th>收率</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1528</td>\n",
       "      <td>700</td>\n",
       "      <td>300</td>\n",
       "      <td>50.0</td>\n",
       "      <td>350.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>1800.0</td>\n",
       "      <td>1000</td>\n",
       "      <td>400.0</td>\n",
       "      <td>...</td>\n",
       "      <td>27.0</td>\n",
       "      <td>191.0</td>\n",
       "      <td>113.0</td>\n",
       "      <td>42</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.879</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1698</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>2100.0</td>\n",
       "      <td>900</td>\n",
       "      <td>370.0</td>\n",
       "      <td>...</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_639</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>2100.0</td>\n",
       "      <td>900</td>\n",
       "      <td>370.0</td>\n",
       "      <td>...</td>\n",
       "      <td>43.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>135.0</td>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.936</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_483</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>290.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>1700.0</td>\n",
       "      <td>900</td>\n",
       "      <td>340.0</td>\n",
       "      <td>...</td>\n",
       "      <td>30.0</td>\n",
       "      <td>207.0</td>\n",
       "      <td>118.0</td>\n",
       "      <td>42</td>\n",
       "      <td>10.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_617</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>2100.0</td>\n",
       "      <td>900</td>\n",
       "      <td>370.0</td>\n",
       "      <td>...</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.983</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 144 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id  P2_W_1M  P2_W_2M  P3_H_1M  P4_H_2M  P2_N_1M  P2_C_1M  P5_W_3M  \\\n",
       "0  sample_1528      700      300     50.0    350.0    405.0      300   1800.0   \n",
       "1  sample_1698      700      200     50.0    320.0    405.0      300   2100.0   \n",
       "2   sample_639      700      200     50.0    320.0    405.0      300   2100.0   \n",
       "3   sample_483      700      200     50.0    290.0    405.0      300   1700.0   \n",
       "4   sample_617      700      200     50.0    320.0    405.0      300   2100.0   \n",
       "\n",
       "   P5_W_1M  P5_H_1M  ...  P2_C1-C12_KD_ABS_sum  P2_C1-C12_D_sum  \\\n",
       "0     1000    400.0  ...                  27.0            191.0   \n",
       "1      900    370.0  ...                  44.0            226.0   \n",
       "2      900    370.0  ...                  43.0            226.0   \n",
       "3      900    340.0  ...                  30.0            207.0   \n",
       "4      900    370.0  ...                  44.0            226.0   \n",
       "\n",
       "   P2_LARGE_KD_sum  P5_NOT_NUM_N  P5_PH_1N  P5_PH_2N  P5_A7_1N  P5_O_2M  \\\n",
       "0            113.0            42       9.0       3.5         1        0   \n",
       "1            134.0            44       9.0       3.5         1        0   \n",
       "2            135.0            44       9.0       3.5         1        0   \n",
       "3            118.0            42      10.0       3.5         1        0   \n",
       "4            134.0            44       9.0       3.5         1        0   \n",
       "\n",
       "   P5_1_3M     收率  \n",
       "0     0.15  0.879  \n",
       "1     0.15  0.902  \n",
       "2     0.15  0.936  \n",
       "3     0.15  0.902  \n",
       "4     0.15  0.983  \n",
       "\n",
       "[5 rows x 144 columns]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the training frame.\n",
    "df_trn.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add a numeric `id` feature parsed from the sample name ('sample_1528' -> 1528.0).\n",
    "# The guard keeps this cell re-runnable: DataFrame.insert raises ValueError\n",
    "# when the column already exists.\n",
    "for _df in [df_trn, df_tst]:\n",
    "    if 'id' not in _df.columns:\n",
    "        _df.insert(1, 'id', _df['样本id'].str.split('_').str[1].astype(float))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "      <th>id</th>\n",
       "      <th>P2_W_1M</th>\n",
       "      <th>P2_W_2M</th>\n",
       "      <th>P3_H_1M</th>\n",
       "      <th>P4_H_2M</th>\n",
       "      <th>P2_N_1M</th>\n",
       "      <th>P2_C_1M</th>\n",
       "      <th>P5_W_3M</th>\n",
       "      <th>P5_W_1M</th>\n",
       "      <th>...</th>\n",
       "      <th>P2_C1-C12_KD_ABS_sum</th>\n",
       "      <th>P2_C1-C12_D_sum</th>\n",
       "      <th>P2_LARGE_KD_sum</th>\n",
       "      <th>P5_NOT_NUM_N</th>\n",
       "      <th>P5_PH_1N</th>\n",
       "      <th>P5_PH_2N</th>\n",
       "      <th>P5_A7_1N</th>\n",
       "      <th>P5_O_2M</th>\n",
       "      <th>P5_1_3M</th>\n",
       "      <th>收率</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1528</td>\n",
       "      <td>1528.0</td>\n",
       "      <td>700</td>\n",
       "      <td>300</td>\n",
       "      <td>50.0</td>\n",
       "      <td>350.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>1800.0</td>\n",
       "      <td>1000</td>\n",
       "      <td>...</td>\n",
       "      <td>27.0</td>\n",
       "      <td>191.0</td>\n",
       "      <td>113.0</td>\n",
       "      <td>42</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.879</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1698</td>\n",
       "      <td>1698.0</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>2100.0</td>\n",
       "      <td>900</td>\n",
       "      <td>...</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_639</td>\n",
       "      <td>639.0</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>2100.0</td>\n",
       "      <td>900</td>\n",
       "      <td>...</td>\n",
       "      <td>43.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>135.0</td>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.936</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_483</td>\n",
       "      <td>483.0</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>290.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>1700.0</td>\n",
       "      <td>900</td>\n",
       "      <td>...</td>\n",
       "      <td>30.0</td>\n",
       "      <td>207.0</td>\n",
       "      <td>118.0</td>\n",
       "      <td>42</td>\n",
       "      <td>10.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_617</td>\n",
       "      <td>617.0</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>2100.0</td>\n",
       "      <td>900</td>\n",
       "      <td>...</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.983</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 145 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id      id  P2_W_1M  P2_W_2M  P3_H_1M  P4_H_2M  P2_N_1M  P2_C_1M  \\\n",
       "0  sample_1528  1528.0      700      300     50.0    350.0    405.0      300   \n",
       "1  sample_1698  1698.0      700      200     50.0    320.0    405.0      300   \n",
       "2   sample_639   639.0      700      200     50.0    320.0    405.0      300   \n",
       "3   sample_483   483.0      700      200     50.0    290.0    405.0      300   \n",
       "4   sample_617   617.0      700      200     50.0    320.0    405.0      300   \n",
       "\n",
       "   P5_W_3M  P5_W_1M  ...  P2_C1-C12_KD_ABS_sum  P2_C1-C12_D_sum  \\\n",
       "0   1800.0     1000  ...                  27.0            191.0   \n",
       "1   2100.0      900  ...                  44.0            226.0   \n",
       "2   2100.0      900  ...                  43.0            226.0   \n",
       "3   1700.0      900  ...                  30.0            207.0   \n",
       "4   2100.0      900  ...                  44.0            226.0   \n",
       "\n",
       "   P2_LARGE_KD_sum  P5_NOT_NUM_N  P5_PH_1N  P5_PH_2N  P5_A7_1N  P5_O_2M  \\\n",
       "0            113.0            42       9.0       3.5         1        0   \n",
       "1            134.0            44       9.0       3.5         1        0   \n",
       "2            135.0            44       9.0       3.5         1        0   \n",
       "3            118.0            42      10.0       3.5         1        0   \n",
       "4            134.0            44       9.0       3.5         1        0   \n",
       "\n",
       "   P5_1_3M     收率  \n",
       "0     0.15  0.879  \n",
       "1     0.15  0.902  \n",
       "2     0.15  0.936  \n",
       "3     0.15  0.902  \n",
       "4     0.15  0.983  \n",
       "\n",
       "[5 rows x 145 columns]"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the training frame again; the numeric `id` column sits right after 样本id.\n",
    "df_trn.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAD4CAYAAAAD6PrjAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAATvElEQVR4nO3df7RlZX3f8fcHRkStOCADZc2gF5PRQLqKTCeW1jYmkib8qA6mkmLTOqVTp0lomqy0q46xq7FZSRes1YqhzSIhYjLQoEGtYRpoUhxBV7MKeJHfIDIihXGocyM/jKISyLd/nOduDzN3ZvYwd99zmHm/1jrrPPvZz973e/c98Jln73P2SVUhSRLAYZMuQJI0PQwFSVLHUJAkdQwFSVLHUJAkdZZNuoADceyxx9bMzMyky5CkF5Xbbrvtz6pqxULrXtShMDMzw+zs7KTLkKQXlST/d0/rPH0kSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeq8qD/RLGl6zGy6bmI/++GLzpnYzz7YOFOQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSZ9BQSLI8ySeSfDHJ/Un+VpJjktyQ5MH2fHQbmySXJtmW5K4ka4asTZK0u6FnCr8B/HFV/QBwKnA/sAnYWlWrga1tGeAsYHV7bAQuG7g2SdIuBguFJEcBPwxcAVBVz1TVk8A6YHMbthk4t7XXAVfWyM3A8iQnDFWfJGl3Q84UXgfMAb+b5PYkH07yCuD4qnoMoD0f18avBB4d235763ueJBuTzCaZnZubG7B8STr0DBkKy4A1wGVVdRrwLb53qmghWaCvduuouryq1lbV2hUrVixOpZIkYNhQ2A5sr6pb2vInGIXE1+ZPC7XnnWPjTxzbfhWwY8D6JEm7GCwUqur/AY8meUPrOgO4D9gCrG9964FrW3sL8O72LqTTgafmTzNJkpbGsoH3//PA7yc5AngIuIBREF2TZAPwCHBeG3s9cDawDXi6jZUkLaFBQ6Gq7gDWLrDqjAXGFnDhkPVIkvbOTzRLkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpM2goJHk4yd1J7kgy2/qOSXJDkgfb89GtP0kuTbItyV1J1gxZmyRpd0sxU/jRqnpjVa1ty5uArVW1GtjalgHOAla3x0bgsiWoTZI0ZhKnj9YBm1t7M3DuWP+VNXIzsDzJCROoT5IOWUOHQgH/K8ltSTa2vuOr6jGA9nxc618JPDq27fbW9zxJNiaZTTI7Nzc3YOmSdOhZNvD+31xVO5IcB9yQ5It7GZsF+mq3jqrLgcsB1q5du9t6SdILN+hMoap2tOedwKeANwFfmz8t1J53tuHbgRPHNl8F7BiyPknS8w0WCklekeSV823gx4F7gC3A+jZsPXBta28B3t3ehXQ68NT8aSZJ0tIY8vTR8cCnksz/nKur6o+TfB64JskG4BHgvDb+euBsYBvwNHDBgLVJkhYwWChU1UPAqQv0fx04Y4H+Ai4cqh5J0r75iWZJUsdQkCR1DAVJUmfozylI0uBmNl03kZ/78EXnTOTnDsmZgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSp0ysUkvy1oQuRJE1e35nCbyW5NcnPJVk+aEWSpInpFQpV9XeAnwZOBGaTXJ3k7w1amSRpyfW+plBVDwL/Dngv8Bbg0iRfTPKTQxUnSVpafa8p/PUklwD3A28F3lZVJ7f2JQPWJ0laQn1nCv8V+AJwalVdWFVf
AKiqHYxmD3uU5PAktyf5o7Z8UpJbkjyY5A+SHNH6X9qWt7X1My/0l5IkvTB9Q+Fs4Oqq+jZAksOSvBygqq7ax7a/wGiGMe9i4JKqWg08AWxo/RuAJ6rq+xnNPi7uWZskaZH0DYVPAy8bW35569urJKuAc4APt+UwOuX0iTZkM3Bua69ry7T1Z7TxkqQl0jcUjqyqb84vtPbLe2z3IeDfAn/Zll8NPFlVz7bl7cDK1l4JPNr2/yzwVBv/PEk2JplNMjs3N9ezfElSH31D4VtJ1swvJPkbwLf3tkGSvw/srKrbxrsXGFo91n2vo+ryqlpbVWtXrFix78olSb0t6znuF4GPJ9nRlk8A/uE+tnkz8PYkZwNHAkcxmjksT7KszQZWAfP73M7ocxDbkywDXgU83vs3kSQdsL4fXvs88APAzwI/B5y8ywxgoW3eV1WrqmoGOB/4TFX9NHAj8M42bD1wbWtvacu09Z+pqt1mCpKk4fSdKQD8EDDTtjktCVV15Qv4me8FPpbk14DbgSta/xXAVUm2MZohnP8C9i1JOgC9QiHJVcD3AXcAz7XuAnqFQlXdBNzU2g8Bb1pgzHeA8/rsT5I0jL4zhbXAKZ7OkaSDW993H90D/NUhC5EkTV7fmcKxwH1JbgW+O99ZVW8fpCpJ0kT0DYUPDFmEJGk69AqFqvpsktcCq6vq0+2+R4cPW5okaan1vXX2exjdj+i3W9dK4A+HKkqSNBl9LzRfyOgTyt+A7gt3jhuqKEnSZPQNhe9W1TPzC+02FL49VZIOMn1D4bNJfhl4Wftu5o8D/2O4siRJk9A3FDYBc8DdwL8Armcf37gmSXrx6fvuo78Efqc9JEkHqb73PvoKC3+3wesWvSJJ0sTsz72P5h3J6MZ1xyx+OZKkSer7fQpfH3t8tao+xOi7liVJB5G+p4/WjC0exmjm8MpBKpIkTUzf00f/eaz9LPAw8FOLXo0kaaL6vvvoR4cuRJI0eX1PH/3S3tZX1QcXpxxJ0iTtz7uPfgjY0pbfBnwOeHSIoiRJk7E/X7Kzpqr+HCDJB4CPV9U/H6owSdLS63ubi9cAz4wtPwPMLHo1kqSJ6jtTuAq4NcmnGH2y+R3AlYNVJUmaiL4fXvt14ALgCeBJ4IKq+o972ybJkUluTXJnknuT/IfWf1KSW5I8mOQPkhzR+l/alre19TMH8otJkvZf39NHAC8HvlFVvwFsT3LSPsZ/F3hrVZ0KvBE4M8npwMXAJVW1mlHIbGjjNwBPVNX3A5e0cZKkJdT36zh/BXgv8L7W9RLgv+1tmxr55tj4lzA69fRWRl/tCbAZOLe117Vl2vozkqRPfZKkxdF3pvAO4O3AtwCqagc9bnOR5PAkdwA7gRuALwNPVtWzbch2Rt/3THt+tO3/WeAp4NU965MkLYK+ofBMVRXt9tlJXtFno6p6rqreCKwC3gScvNCw9rzQrGC323Un2ZhkNsns3Nxcr+IlSf30DYVrkvw2sDzJe4BPsx9fuFNVTwI3Aae3fcy/62kVsKO1twMnQvcd0K8CHl9gX5dX1dqqWrtixYq+JUiSeuj77qP/xOg8/yeBNwD/vqr+y962SbIiyfLWfhnwY8D9wI3AO9uw9cC1rb2lLdPWf6bNTiRJS2Sfn1NIcjjwJ1X1Y4yuC/R1ArC5bX8YcE1V/VGS+4CPJfk14Hbgijb+CuCqJNsYzRDO34+fJUlaBPsMhap6LsnTSV5VVU/13XFV3QWctkD/Q4yuL+za/x1G3+gmSZqQvp9o/g5wd5IbaO9AAqiqfzVIVZKkiegbCte1hyTpILbXUEjymqp6pKo2722cJOngsK93H/3hfCPJJweuRZI0YfsKhfEPlL1uyEIkSZO3r1CoPbQlSQehfV1oPjXJNxjNGF7W2rTlqqqjBq1OkrSk9hoKVXX4UhUiSZq8/fk+BUnSQc5QkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQ
kCR1DAVJUsdQkCR1BguFJCcmuTHJ/UnuTfILrf+YJDckebA9H936k+TSJNuS3JVkzVC1SZIWNuRM4VngX1fVycDpwIVJTgE2AVurajWwtS0DnAWsbo+NwGUD1iZJWsBgoVBVj1XVF1r7z4H7gZXAOmBzG7YZOLe11wFX1sjNwPIkJwxVnyRpd0tyTSHJDHAacAtwfFU9BqPgAI5rw1YCj45ttr31SZKWyOChkOSvAJ8EfrGqvrG3oQv01QL725hkNsns3NzcYpUpSWLgUEjyEkaB8PtV9d9b99fmTwu1552tfztw4tjmq4Adu+6zqi6vqrVVtXbFihXDFS9Jh6Ah330U4Arg/qr64NiqLcD61l4PXDvW/+72LqTTgafmTzNJkpbGsgH3/WbgnwB3J7mj9f0ycBFwTZINwCPAeW3d9cDZwDbgaeCCAWuTJC1gsFCoqv/NwtcJAM5YYHwBFw5VjyRp3/xEsySpYyhIkjqGgiSpYyhIkjpDvvtIkg5qM5uum9jPfviicwbZrzMFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQYLhSQfSbIzyT1jfcckuSHJg+356NafJJcm2ZbkriRrhqpLkrRnQ84Ufg84c5e+TcDWqloNbG3LAGcBq9tjI3DZgHVJkvZgsFCoqs8Bj+/SvQ7Y3NqbgXPH+q+skZuB5UlOGKo2SdLClvqawvFV9RhAez6u9a8EHh0bt7317SbJxiSzSWbn5uYGLVaSDjXTcqE5C/TVQgOr6vKqWltVa1esWDFwWZJ0aFnqUPja/Gmh9ryz9W8HThwbtwrYscS1SdIhb6lDYQuwvrXXA9eO9b+7vQvpdOCp+dNMkqSls2yoHSf5KPAjwLFJtgO/AlwEXJNkA/AIcF4bfj1wNrANeBq4YKi6JEl7NlgoVNW79rDqjAXGFnDhULVIh5KZTddNugS9iE3LhWZJ0hQwFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJnakKhSRnJnkgybYkmyZdjyQdaqYmFJIcDvwmcBZwCvCuJKdMtipJOrQsm3QBY94EbKuqhwCSfAxYB9w3xA+b2XTdELudag9fdM6kS1hyh+LfWToQ0xQKK4FHx5a3A39z10FJNgIb2+I3kzywBLX1dSzwZ5MuYk9yMTDlNWJ9i2Haa5z2+mD6azw2Fx9Qfa/d04ppCoUs0Fe7dVRdDlw+fDn7L8lsVa2ddB17M+01Wt+Bm/Yap70+mP4ah6xvaq4pMJoZnDi2vArYMaFaJOmQNE2h8HlgdZKTkhwBnA9smXBNknRImZrTR1X1bJJ/CfwJcDjwkaq6d8Jl7a+pPK21i2mv0foO3LTXOO31wfTXOFh9qdrttL0k6RA1TaePJEkTZihIkjqGQk99bsGR5KeS3Jfk3iRXj/U/l+SO9hjk4vm+6ktyyVgNX0ry5Ni69UkebI/1Q9S3CDVOwzF8TZIbk9ye5K4kZ4+te1/b7oEkPzFN9SWZSfLtseP3W0PU17PG1ybZ2uq7KcmqsXWDvw4PsL6leA1+JMnOJPfsYX2SXNrqvyvJmrF1i3P8qsrHPh6MLnx/GXgdcARwJ3DKLmNWA7cDR7fl48bWfXPS9e0y/ucZXcgHOAZ4qD0f3dpHT1ON03IMGV3c+9nWPgV4eKx9J/BS4KS2n8OnqL4Z4J4hj99+1PhxYH1rvxW4aqlehwdS31K8BtvP+GFgzZ7+XsDZwP9k9Lmu04FbFvv4OVPop7sFR1U9A8zfgmPce4DfrKonAKpq55TVN+5dwEdb+yeAG6rq8Vb7DcCZU1bjUuhTXwFHtfar+N7naNYBH6uq71bVV4BtbX/TUt9S
6VPjKcDW1r5xbP1SvA4PpL4lUVWfAx7fy5B1wJU1cjOwPMkJLOLxMxT6WegWHCt3GfN64PVJ/jTJzUnG/yBHJplt/edOqD5gND1m9K/Zz+zvthOsEabjGH4A+MdJtgPXM5rN9N12kvUBnNROK302yd9d5Nr2p8Y7gX/Q2u8AXpnk1T23nWR9MPxrsI89/Q6LdvwMhX763IJjGaNTSD/C6F+5H06yvK17TY0+kv6PgA8l+b4J1DfvfOATVfXcC9j2QBxIjTAdx/BdwO9V1SpG0/irkhzWc9tJ1vcYo+N3GvBLwNVJjmLx9anx3wBvSXI78Bbgq8CzPbc9UAdSHwz/GuxjT7/Doh0/Q6GfPrfg2A5cW1V/0U4hPMAoJKiqHe35IeAm4LQJ1DfvfJ5/Wmapbi9yIDVOyzHcAFzT6vg/wJGMbpy2FMfwBdfXTmt9vfXfxui8+usXub5eNVbVjqr6yRZQ7299T/XZdsL1LcVrsI89/Q6Ld/yGvnByMDwYzQIeYnRKY/4C1Q/uMuZMYHNrH8toKvdqRhd9XjrW/yB7ucA6VH1t3BuAh2kfWqzvXaD6Sqvz6NY+ZhLHcC81TsUxZHSB75+29sntP7oAP8jzLzQ/xOJfaD6Q+lbM18PoIutXJ/U3bn+/w1r714FfXarX4QHWN/hrcKyGGfZ8ofkcnn+h+dbFPn6L/gsdrA9G0/EvMfpX1vtb368Cb2/tAB9k9P0PdwPnt/6/3ZbvbM8bJlFfW/4AcNEC2/4zRhdHtwEXTOoY7qnGaTmGjC5C/mmr4w7gx8e2fX/b7gHgrGmqj9E58ntb/xeAt03qbwy8s/0P9UvAh2n/o12q1+ELrW8JX4MfZXS67y8Y/et/A/AzwM+09WH0ZWRfbnWsXezj520uJEkdrylIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjr/H+ez4bfNKr74AAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Histogram of the target column 收率 (yield) on the current training frame.\n",
    "df_trn['收率'].plot.hist()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only regular samples: yield strictly between 0.8671 and 0.9861.\n",
    "df_trn = df_trn[(df_trn['收率'] > 0.8671) & (df_trn['收率'] < 0.9861)].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAD4CAYAAAAD6PrjAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAATa0lEQVR4nO3df7BfdX3n8eeL8FtUoFxYTMCLNnaNnRpsZNl1XSnWVnEt2pYK3dbIsqaz4rbOup1G1lnZzjJDd1W2jjuucVEjW0vxR5UtWA0s1mkHxCDIr2iJmIWYDEkriogFwff+8f3c4zW5Sb7Jvef7vTf3+Zj5zvdzPuec73l/5t7kdc+P7zmpKiRJAjhk3AVIkuYPQ0GS1DEUJEkdQ0GS1DEUJEmdQ8ddwGyccMIJNTk5Oe4yJGlBue222/6uqiZmmregQ2FycpKNGzeOuwxJWlCS/L89zfPwkSSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSps6C/0az9M7n2urFte8vlrx7btiUNzz0FSVLHUJAkdQwFSVLHUJAkdQwFSVKnt1BIcmSSW5N8Nck9Sf5z6z8tyZeS3Jfkz5Ic3vqPaNOb2/zJvmqTJM2szz2Fx4Gzq+qFwErglUnOBP4IuKKqlgMPAxe15S8CHq6qnwauaMtJkkaot1CogUfb5GHtVcDZwCda/3rgta19bpumzX95kvRVnyRpd72eU0iyJMkdwA5gA/AN4DtV9WRbZCuwtLWXAg8CtPnfBX6qz/okST+p11CoqqeqaiWwDDgDeP5Mi7X3mfYKateOJGuSbEyycefOnXNXrCRpNFcfVdV3gC8AZwLHJpm6vcYyYFtrbwVOAWjznwl8e4bPWldVq6pq1cTERN+lS9Ki0ufVRxNJjm3to4BfBDYBNwG/3hZbDXymta9t07T5/7eqdttTkCT1p88b4p0MrE+yhEH4XFNVf5HkXuDqJP8FuB24si1/JXBVks0M9hDO77E2SdIMeguFqroTOH2G/vsZnF/Ytf8fgPP6qkeStG9+o1mS1DEUJEkdQ0GS1DEUJEkdQ0GS1DEUJEkdQ0GS1DEUJEkdQ0GS1DEUJEmdPu99JC1qk2uvG8t2t1z+6rFsVwcH9xQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSZ3eQiHJKUluSrIpyT1Jfq/1X5rkW0nuaK9zpq3z9iSbk3w9yS/3VZskaWZ93jr7SeBtVfWVJE8Hbkuyoc27oqreNX3hJCuA84EXAM8CbkjyvKp6qscaJUnT9LanUFXbq+orrf09YBOwdC+rnAtcXVWPV9U3gc3AGX3VJ0na3UjOKSSZBE4HvtS63pLkziQfSnJc61sKPDhtta3MECJJ1iTZmGTjzp07e6xakhaf3kMhyTHAJ4G3VtUjwPuB5wIrge3Au6cWnWH12q2jal1VraqqVRMTEz1VLUmLU6+hkOQwBoHwJ1X1KYCqeqiqnqqqHwEf5MeHiLYCp0xbfRmwrc/6JEk/qc+rjwJcCWyqqvdM6z952mKvA+5u7WuB85MckeQ0YDlwa1/1SZJ21+fVRy8Bfhu4K8kdre8S4IIkKxkcGtoC/A5AVd2T5BrgXgZXLl3slUeSNFq9hUJV/TUznye4fi/rXAZc1ldNkqS98xvNkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6vQWCklOSXJTkk1J7knye63/+CQbktzX3o9r/Uny3iSbk9yZ5EV91SZJmlmfewpPAm+rqucDZwIXJ1kBrAVurKrlwI1tGuBVwPL2WgO8v8faJEkzGCoU
kvzs/n5wVW2vqq+09veATcBS4FxgfVtsPfDa1j4X+GgN3AIcm+Tk/d2uJOnADbun8D+T3JrkzUmO3d+NJJkETge+BJxUVdthEBzAiW2xpcCD01bb2vp2/aw1STYm2bhz5879LUWStBdDhUJV/XPgXwGnABuTfCzJK4ZZN8kxwCeBt1bVI3tbdKZNz1DLuqpaVVWrJiYmhilBkjSkoc8pVNV9wDuAPwBeBrw3ydeS/Oqe1klyGINA+JOq+lTrfmjqsFB739H6tzIInSnLgG3D1idJmr1hzyn8XJIrGJwXOBt4TTuBfDZwxR7WCXAlsKmq3jNt1rXA6tZeDXxmWv8b2lVIZwLfnTrMJEkajUOHXO59wAeBS6rqB1OdVbUtyTv2sM5LgN8G7kpyR+u7BLgcuCbJRcADwHlt3vXAOcBm4DHgwv0ZiCRp9oYNhXOAH1TVUwBJDgGOrKrHquqqmVaoqr9m5vMEAC+fYfkCLh6yHklSD4Y9p3ADcNS06aNbnyTpIDJsKBxZVY9OTbT20f2UJEkal2FD4fvTbzuR5OeBH+xleUnSAjTsOYW3Ah9PMnWJ6MnA6/spSZI0LkOFQlV9Ock/Bn6Gwcnjr1XVD3utTJI0csPuKQC8GJhs65yehKr6aC9VSZLGYqhQSHIV8FzgDuCp1l2AoSBJB5Fh9xRWASvadwkkSQepYa8+uhv4R30WIkkav2H3FE4A7k1yK/D4VGdV/UovVUmSxmLYULi0zyIkSfPDsJek/lWSZwPLq+qGJEcDS/otTZI0asPeOvtNwCeAD7SupcCn+ypKkjQew55ovpjBrbAfge6BOyfudQ1J0oIzbCg8XlVPTE0kOZQZHpUpSVrYhg2Fv0pyCXBUezbzx4H/019ZkqRxGDYU1gI7gbuA32HwlLQ9PXFNkrRADXv10Y8YPI7zg/2WI0kap2HvffRNZjiHUFXPmfOKJEljsz/3PppyJHAecPzclyNJGqehzilU1d9Pe32rqv47cHbPtUmSRmzYw0cvmjZ5CIM9h6f3UpEkaWyGPXz07mntJ4EtwG/MeTWSpLEa9uqjX+i7EEnS+A17+Ojf721+Vb1nhnU+BPxLYEdV/WzruxR4E4PvPABcUlXXt3lvBy5i8GS3362qzw05BknSHNmfq49eDFzbpl8DfBF4cC/rfAR4H7s/svOKqnrX9I4kK4DzgRcAzwJuSPK8qnoKSftlcu11Y9nulstfPZbtam7tz0N2XlRV34PuL/6PV9W/2dMKVfXFJJNDfv65wNVV9TjwzSSbgTOAm4dcX5I0B4YNhVOBJ6ZNPwFMHuA235LkDcBG4G1V9TCDW3HfMm2Zra1vN0nWAGsATj311AMsYbzG9ZecJO3LsPc+ugq4NcmlSd4JfIndDwsN4/3Ac4GVwHZ+fFVTZlh2xruwVtW6qlpVVasmJiYOoARJ0p4Me/XRZUk+C7y0dV1YVbfv78aq6qGpdpIPAn/RJrcCp0xbdBmwbX8/X5I0O8PuKQAcDTxSVX8MbE1y2v5uLMnJ0yZfB9zd2tcC5yc5on3ucuDW/f18SdLsDHtJ6jsZXIH0M8CHgcOA/83gaWx7WudPgbOAE5JsBd4JnJVkJYNDQ1sY3IabqronyTXAvQy+HHexVx5J0ugNe6L5dcDpwFcAqmpbkr3e5qKqLpih+8q9LH8ZcNmQ9UiSejBsKDxRVZWkAJI8rceapDnjlV7S/hn2nMI1ST4AHJvkTcAN+MAdSTroDHv10bvas5kfYXBe4T9V1YZeK5Mkjdw+QyHJEuBzVfWLgEEgSQexfR4+alcBPZbkmSOoR5I0RsOeaP4H4K4kG4DvT3VW1e/2UpUkaSyGDYXr2kuSdBDbaygkObWqHqiq9aMqSJI0Pvs6p/DpqUaST/ZciyRpzPYVCtPvXvqcPguRJI3fvkKh9tCWJB2E9nWi+YVJHmGwx3BUa9Omq6qe0Wt1kqSR2msoVNWSURUiSRq//XmegiTpIGcoSJI6hoIkqWMo
SJI6hoIkqWMoSJI6hoIkqWMoSJI6hoIkqWMoSJI6hoIkqdNbKCT5UJIdSe6e1nd8kg1J7mvvx7X+JHlvks1J7kzyor7qkiTtWZ97Ch8BXrlL31rgxqpaDtzYpgFeBSxvrzXA+3usS5K0B72FQlV9Efj2Lt3nAlOP9lwPvHZa/0dr4Bbg2CQn91WbJGlmoz6ncFJVbQdo7ye2/qXAg9OW29r6dpNkTZKNSTbu3Lmz12IlabGZLyeaM0PfjE96q6p1VbWqqlZNTEz0XJYkLS6jDoWHpg4LtfcdrX8rcMq05ZYB20ZcmyQteqMOhWuB1a29GvjMtP43tKuQzgS+O3WYSZI0Ovt6RvMBS/KnwFnACUm2Au8ELgeuSXIR8ABwXlv8euAcYDPwGHBhX3VJkvast1Coqgv2MOvlMyxbwMV91SJJGs58OdEsSZoHDAVJUqe3w0eSNCqTa68by3a3XP7qsWy3T+4pSJI6hoIkqWMoSJI6hoIkqWMoSJI6hoIkqWMoSJI6i/Z7CuO6rlmS5jP3FCRJHUNBktQxFCRJnUV7TkGSZmuc5yb7uu+SewqSpI6hIEnqePhII+ElwNLC4J6CJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOmO5JDXJFuB7wFPAk1W1KsnxwJ8Bk8AW4Deq6uFx1CdJi9U49xR+oapWVtWqNr0WuLGqlgM3tmlJ0gjNp8NH5wLrW3s98Nox1iJJi9K4QqGAzye5Lcma1ndSVW0HaO8nzrRikjVJNibZuHPnzhGVK0mLw7huc/GSqtqW5ERgQ5KvDbtiVa0D1gGsWrWq+ipQkhajsewpVNW29r4D+HPgDOChJCcDtPcd46hNkhazkYdCkqclefpUG/gl4G7gWmB1W2w18JlR1yZJi904Dh+dBPx5kqntf6yq/jLJl4FrklwEPACcN4baJGlRG3koVNX9wAtn6P974OWjrkeS9GPz6ZJUSdKY+ZAdSXPCBykdHNxTkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUmfehUKSVyb5epLNSdaOux5JWkzmVSgkWQL8D+BVwArggiQrxluVJC0e8yoUgDOAzVV1f1U9AVwNnDvmmiRp0Th03AXsYinw4LTprcA/mb5AkjXAmjb5aJKvj6i2uXYC8HfjLmIOOZ75zfHMb/s9nvzRrLb37D3NmG+hkBn66icmqtYB60ZTTn+SbKyqVeOuY644nvnN8cxv82k88+3w0VbglGnTy4BtY6pFkhad+RYKXwaWJzktyeHA+cC1Y65JkhaNeXX4qKqeTPIW4HPAEuBDVXXPmMvqy4I/BLYLxzO/OZ75bd6MJ1W176UkSYvCfDt8JEkaI0NBktQxFObYvm7TkeTUJDcluT3JnUnOaf2HJVmf5K4km5K8ffTV726I8Tw7yY1tLF9IsmzavNVJ7muv1aOtfGYHOp4kK5PcnOSeNu/1o69+d7P5+bT5z0jyrSTvG13VezbL37dTk3y+/fu5N8nkKGufySzH81/b79umJO9NMtMl+3OvqnzN0YvByfFvAM8BDge+CqzYZZl1wL9t7RXAltb+TeDq1j4a2AJMLoDxfBxY3dpnA1e19vHA/e39uNY+bgGP53nA8tZ+FrAdOHahjmfa/D8GPga8b5xjmYvxAF8AXtHaxwBHL9TxAP8M+Jv2GUuAm4GzRlG3ewpza5jbdBTwjNZ+Jj/+HkYBT0tyKHAU8ATwSP8l79Uw41kB3NjaN02b/8vAhqr6dlU9DGwAXjmCmvfmgMdTVX9bVfe19jZgBzAxkqr3bDY/H5L8PHAS8PkR1DqMAx5Pu0faoVW1AaCqHq2qx0ZT9h7N5udTwJEMwuQI4DDgod4rxsNHc22m23Qs3WWZS4HfSrIVuB74d63/E8D3GfwF+gDwrqr6
dq/V7tsw4/kq8Gut/Trg6Ul+ash1R2024+kkOYPBP9Zv9FTnsA54PEkOAd4N/H7vVQ5vNj+f5wHfSfKpdmj2v7UbbI7TAY+nqm5mEBLb2+tzVbWp53oBQ2Gu7fM2HcAFwEeqahlwDnBV+wd6BvAUg0MTpwFvS/KcPosdwjDj+Q/Ay5LcDrwM+Bbw5JDrjtpsxjP4gORk4Crgwqr6UV+FDmk243kzcH1VPcj8MZvxHAq8tM1/MYNDNm/srdLhHPB4kvw08HwGd3VYCpyd5F/0WeyUefXltYPAMLfpuIh2GKWqbk5yJIObYf0m8JdV9UNgR5K/AVYxOBY/LvscTzuU8qsASY4Bfq2qvtv2hM7aZd0v9FnsEA54PG36GcB1wDuq6paRVLx3s/n5/FPgpUnezOD4++FJHq2qcT7DZLa/b7dX1f1t3qeBM4ErR1H4HsxmPGuAW6rq0TbvswzG88Xeqx7niZiD7cUgZO9n8Jf+1ImlF+yyzGeBN7b289svSYA/AD7c2k8D7gV+bgGM5wTgkNa+DPjD1j4e+CaDk8zHtfbxC3g8hzM49vvWcf+ezcV4dlnmjcyPE82z+fksactPtOkPAxcv4PG8HrihfcZh7XfvNSOpe9y/CAfbi8Ehob9lcLz5P7a+PwR+pbVXMLiq4KvAHcAvtf5jGFyJcE8LhN8f91iGHM+vA/e1Zf4XcMS0df81sLm9Lhz3WGYzHuC3gB+2n9nUa+VCHc8unzEvQmEOft9eAdwJ3AV8BDh8oY6HQch9ANjU/j94z6hq9jYXkqSOJ5olSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSZ3/DzGuKrVk9SkXAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Histogram of the target column 收率 (yield) on the current training frame.\n",
    "df_trn['收率'].plot.hist()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 训练模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _predict_best_iteration(booster, dmatrix):\n",
    "    \"\"\"Predict with the trees up to the best early-stopping iteration.\n",
    "\n",
    "    Older XGBoost releases expose `best_ntree_limit` and accept `ntree_limit`;\n",
    "    newer releases dropped both in favour of `iteration_range`, so the call is\n",
    "    chosen from what the booster object actually provides.\n",
    "    \"\"\"\n",
    "    best_ntree_limit = getattr(booster, 'best_ntree_limit', None)\n",
    "    if best_ntree_limit is not None:\n",
    "        return booster.predict(dmatrix, ntree_limit=best_ntree_limit)\n",
    "    best_iteration = getattr(booster, 'best_iteration', None)\n",
    "    if best_iteration is not None:\n",
    "        return booster.predict(dmatrix, iteration_range=(0, best_iteration + 1))\n",
    "    # No early-stopping information recorded: use every tree.\n",
    "    return booster.predict(dmatrix)\n",
    "\n",
    "\n",
    "def xgb_cv(train, test, params, fit_params, feature_names, nfold, seed):\n",
    "    \"\"\"Run K-fold cross-validated XGBoost training and average the test predictions.\n",
    "\n",
    "    Args:\n",
    "        train: training DataFrame holding '样本id', the target '收率' and all\n",
    "            columns listed in feature_names.\n",
    "        test: test DataFrame holding '样本id' and all columns in feature_names.\n",
    "        params: booster parameters handed to xgb.train(params=...).\n",
    "        fit_params: extra keyword arguments unpacked into xgb.train().\n",
    "        feature_names: list of column names used as model inputs.\n",
    "        nfold: number of KFold splits.\n",
    "        seed: random_state of the shuffled KFold splitter.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with columns 'id' (sample name) and 'pred' (mean of the\n",
    "        per-fold test predictions). The out-of-fold MSE is printed.\n",
    "    \"\"\"\n",
    "    target = train['收率']\n",
    "    # Out-of-fold predictions live in a positional NumPy buffer. KFold yields\n",
    "    # row positions, so writing them through label-based .loc would touch the\n",
    "    # wrong rows whenever train does not carry a default RangeIndex.\n",
    "    oof_pred = np.zeros(len(train))\n",
    "    # Submission frame: sample name plus the running fold-averaged prediction.\n",
    "    test_pred = pd.DataFrame({'id': test['样本id'], 'pred': np.zeros(len(test))})\n",
    "    kfolder = KFold(n_splits=nfold, shuffle=True, random_state=seed)\n",
    "    # The test DMatrix is identical for every fold, so build it once.\n",
    "    xgb_tst = xgb.DMatrix(data=test[feature_names])\n",
    "    print('\\n')\n",
    "    for trn_idx, val_idx in kfolder.split(target):\n",
    "        fold_trn, fold_val = train.iloc[trn_idx], train.iloc[val_idx]\n",
    "        xgb_trn = xgb.DMatrix(fold_trn[feature_names], fold_trn['收率'])\n",
    "        xgb_val = xgb.DMatrix(fold_val[feature_names], fold_val['收率'])\n",
    "        # Both sets are monitored; the training log reports that 'valid-rmse'\n",
    "        # is the metric used for early stopping.\n",
    "        xgb_reg = xgb.train(params=params, dtrain=xgb_trn, **fit_params,\n",
    "                            evals=[(xgb_trn, 'train'), (xgb_val, 'valid')])\n",
    "        # Reuse the validation DMatrix instead of rebuilding it for prediction.\n",
    "        oof_pred[val_idx] = _predict_best_iteration(xgb_reg, xgb_val)\n",
    "        # Each fold contributes an equal share to the final test prediction.\n",
    "        test_pred['pred'] += _predict_best_iteration(xgb_reg, xgb_tst) / nfold\n",
    "    print('\\nCV LOSS:', mse(target, oof_pred), '\\n')\n",
    "    return test_pred\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training configuration.\n",
    "# fit_params: boosting-round budget, logging interval and early-stopping patience.\n",
    "fit_params = dict(\n",
    "    num_boost_round=10800,\n",
    "    verbose_eval=300,\n",
    "    early_stopping_rounds=360,\n",
    ")\n",
    "# params_xgb: booster hyper-parameters.\n",
    "# NOTE(review): newer XGBoost releases deprecate 'reg:linear' (in favour of\n",
    "# 'reg:squarederror') and 'silent' (in favour of 'verbosity') - confirm against\n",
    "# the installed version before changing them.\n",
    "params_xgb = {\n",
    "    'eta': 0.01,\n",
    "    'max_depth': 7,\n",
    "    'subsample': 0.8,\n",
    "    'booster': 'gbtree',\n",
    "    'colsample_bytree': 0.8,\n",
    "    'objective': 'reg:linear',\n",
    "    'silent': True,\n",
    "    'nthread': 4,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "[0]\ttrain-rmse:0.42052\tvalid-rmse:0.417952\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 360 rounds.\n",
      "[300]\ttrain-rmse:0.023717\tvalid-rmse:0.023667\n",
      "[600]\ttrain-rmse:0.00645\tvalid-rmse:0.011488\n",
      "[900]\ttrain-rmse:0.004691\tvalid-rmse:0.011727\n",
      "Stopping. Best iteration:\n",
      "[600]\ttrain-rmse:0.00645\tvalid-rmse:0.011488\n",
      "\n",
      "[0]\ttrain-rmse:0.419812\tvalid-rmse:0.420785\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 360 rounds.\n",
      "[300]\ttrain-rmse:0.02374\tvalid-rmse:0.025614\n",
      "[600]\ttrain-rmse:0.006597\tvalid-rmse:0.01204\n",
      "[900]\ttrain-rmse:0.004692\tvalid-rmse:0.01197\n",
      "Stopping. Best iteration:\n",
      "[810]\ttrain-rmse:0.005159\tvalid-rmse:0.011948\n",
      "\n",
      "[0]\ttrain-rmse:0.419963\tvalid-rmse:0.420191\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 360 rounds.\n",
      "[300]\ttrain-rmse:0.023604\tvalid-rmse:0.025064\n",
      "[600]\ttrain-rmse:0.006202\tvalid-rmse:0.01245\n",
      "[900]\ttrain-rmse:0.004472\tvalid-rmse:0.012215\n",
      "[1200]\ttrain-rmse:0.003453\tvalid-rmse:0.012209\n",
      "Stopping. Best iteration:\n",
      "[1062]\ttrain-rmse:0.003866\tvalid-rmse:0.012199\n",
      "\n",
      "[0]\ttrain-rmse:0.420254\tvalid-rmse:0.419025\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 360 rounds.\n",
      "[300]\ttrain-rmse:0.02381\tvalid-rmse:0.024832\n",
      "[600]\ttrain-rmse:0.006467\tvalid-rmse:0.010957\n",
      "[900]\ttrain-rmse:0.004619\tvalid-rmse:0.010752\n",
      "[1200]\ttrain-rmse:0.003542\tvalid-rmse:0.010815\n",
      "Stopping. Best iteration:\n",
      "[873]\ttrain-rmse:0.004751\tvalid-rmse:0.01075\n",
      "\n",
      "[0]\ttrain-rmse:0.419487\tvalid-rmse:0.422069\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 360 rounds.\n",
      "[300]\ttrain-rmse:0.023859\tvalid-rmse:0.024063\n",
      "[600]\ttrain-rmse:0.006739\tvalid-rmse:0.0102\n",
      "[900]\ttrain-rmse:0.004817\tvalid-rmse:0.010053\n",
      "[1200]\ttrain-rmse:0.003709\tvalid-rmse:0.010089\n",
      "Stopping. Best iteration:\n",
      "[872]\ttrain-rmse:0.004956\tvalid-rmse:0.010047\n",
      "\n",
      "\n",
      "CV LOSS: 0.0001280110217167903 \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Run 5-fold cross-validated training (seed 0). Features are every column\n",
    "# between the first one (样本id) and the last one (收率).\n",
    "pred_xgb_a = xgb_cv(\n",
    "    train=df_trn,\n",
    "    test=df_tst,\n",
    "    params=params_xgb,\n",
    "    fit_params=fit_params,\n",
    "    feature_names=list(df_trn.columns[1:-1]),\n",
    "    nfold=5,\n",
    "    seed=0,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the predictions as the 收率 column of test set A.\n",
    "# NOTE(review): .values assigns by position, so this assumes df_tst rows are in\n",
    "# the same order as df_tst_a - confirm against the cell that builds df_tst.\n",
    "df_tst_a['收率'] = pred_xgb_a['pred'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "      <th>A1</th>\n",
       "      <th>A2</th>\n",
       "      <th>A3</th>\n",
       "      <th>A4</th>\n",
       "      <th>A5</th>\n",
       "      <th>A6</th>\n",
       "      <th>A7</th>\n",
       "      <th>A8</th>\n",
       "      <th>A9</th>\n",
       "      <th>...</th>\n",
       "      <th>B6</th>\n",
       "      <th>B7</th>\n",
       "      <th>B8</th>\n",
       "      <th>B9</th>\n",
       "      <th>B10</th>\n",
       "      <th>B11</th>\n",
       "      <th>B12</th>\n",
       "      <th>B13</th>\n",
       "      <th>B14</th>\n",
       "      <th>收率</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1656</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>6:00:00</td>\n",
       "      <td>29</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>79</td>\n",
       "      <td>17:00:00</td>\n",
       "      <td>45</td>\n",
       "      <td>17:00-18:30</td>\n",
       "      <td>18:30-20:00</td>\n",
       "      <td>20:00-21:00</td>\n",
       "      <td>1200</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "      <td>0.905793</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1548</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>12:30:00</td>\n",
       "      <td>39</td>\n",
       "      <td>12:50:00</td>\n",
       "      <td>80.0</td>\n",
       "      <td>14:20:00</td>\n",
       "      <td>...</td>\n",
       "      <td>65</td>\n",
       "      <td>10:00:00</td>\n",
       "      <td>45</td>\n",
       "      <td>12:00-13:00</td>\n",
       "      <td>14:00-15:30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>800</td>\n",
       "      <td>0.15</td>\n",
       "      <td>385</td>\n",
       "      <td>0.879575</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_769</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>6:00:00</td>\n",
       "      <td>80</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>80</td>\n",
       "      <td>17:00:00</td>\n",
       "      <td>45</td>\n",
       "      <td>17:00-20:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1200</td>\n",
       "      <td>0.15</td>\n",
       "      <td>440</td>\n",
       "      <td>0.934695</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_1881</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>22:00:00</td>\n",
       "      <td>29</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>80</td>\n",
       "      <td>9:00:00</td>\n",
       "      <td>45</td>\n",
       "      <td>9:00-10:30</td>\n",
       "      <td>10:30-12:00</td>\n",
       "      <td>12:00-13:00</td>\n",
       "      <td>1200</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "      <td>0.903490</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_1807</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>22:00:00</td>\n",
       "      <td>30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>79</td>\n",
       "      <td>9:00:00</td>\n",
       "      <td>45</td>\n",
       "      <td>9:00-10:30</td>\n",
       "      <td>10:30-12:00</td>\n",
       "      <td>12:00-13:00</td>\n",
       "      <td>1200</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "      <td>0.928534</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 44 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id   A1  A2     A3   A4        A5  A6        A7    A8        A9  \\\n",
       "0  sample_1656  300 NaN  405.0  700   6:00:00  29       NaN   NaN   8:00:00   \n",
       "1  sample_1548  300 NaN  405.0  700  12:30:00  39  12:50:00  80.0  14:20:00   \n",
       "2   sample_769  300 NaN  405.0  700   6:00:00  80       NaN   NaN   8:00:00   \n",
       "3  sample_1881  300 NaN  405.0  700  22:00:00  29       NaN   NaN   0:00:00   \n",
       "4  sample_1807  300 NaN  405.0  700  22:00:00  30       NaN   NaN   0:00:00   \n",
       "\n",
       "   ...  B6        B7  B8           B9          B10          B11   B12   B13  \\\n",
       "0  ...  79  17:00:00  45  17:00-18:30  18:30-20:00  20:00-21:00  1200  0.15   \n",
       "1  ...  65  10:00:00  45  12:00-13:00  14:00-15:30          NaN   800  0.15   \n",
       "2  ...  80  17:00:00  45  17:00-20:00          NaN          NaN  1200  0.15   \n",
       "3  ...  80   9:00:00  45   9:00-10:30  10:30-12:00  12:00-13:00  1200  0.15   \n",
       "4  ...  79   9:00:00  45   9:00-10:30  10:30-12:00  12:00-13:00  1200  0.15   \n",
       "\n",
       "   B14        收率  \n",
       "0  400  0.905793  \n",
       "1  385  0.879575  \n",
       "2  440  0.934695  \n",
       "3  400  0.903490  \n",
       "4  400  0.928534  \n",
       "\n",
       "[5 rows x 44 columns]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of test set A, including the '收率' column.\n",
    "df_tst_a.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
